diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-64/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/chat_template.jinja b/dapo_lorafa_20251202_173337/checkpoint-576/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + 
'<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/special_tokens_map.json b/dapo_lorafa_20251202_173337/checkpoint-576/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json b/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..0d4a0dcee8438cb49e7c5f2a024517ecf5b0125c --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/trainer_state.json @@ -0,0 +1,17890 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5298988040478381, + "eval_steps": 500, + "global_step": 576, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025743793230503798, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.920872470393078e-06, + "clip_ratio/high_mean": 1.2302181175982696e-06, + "clip_ratio/low_mean": 2.9912232776041492e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1142450779952924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14377.0, + "completions/max_terminated_length": 14377.0, + "completions/mean_length": 4861.1796875, + 
"completions/mean_terminated_length": 4861.1796875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0784558206796646, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023554943036288023, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 1437829.0, + "reward": 0.3515625, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.00045694064465351403, + "sampling/sampling_logp_difference/max": 7.690957069396973, + "sampling/sampling_logp_difference/mean": 0.018809247761964798, + "step": 2 + }, + { + "clip_ratio/high_max": 1.673043971095467e-05, + "clip_ratio/high_mean": 4.8752071961644106e-06, + "clip_ratio/low_mean": 2.1540331545111258e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6415538741275668e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15479.0, + "completions/mean_length": 6167.5078125, + "completions/mean_terminated_length": 5922.3125, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "entropy": 1.1373522356152534, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002558506093919277, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 2245838.0, + "reward": 0.296875, + "reward_std": 0.2669745087623596, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000486373901367, + "sampling/importance_sampling_ratio/min": 2.8637201467063278e-05, + "sampling/sampling_logp_difference/max": 10.460803985595703, + 
"sampling/sampling_logp_difference/mean": 0.02123238891363144, + "step": 3 + }, + { + "clip_ratio/high_max": 4.3118818666698644e-05, + "clip_ratio/high_mean": 1.0779704666674661e-05, + "clip_ratio/low_mean": 3.257358957853285e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.335329458626802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 5691.9296875, + "completions/mean_terminated_length": 5435.3203125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "entropy": 1.1964457035064697, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001936351996846497, + "learning_rate": 1e-05, + "loss": 0.0366, + "num_tokens": 2998805.0, + "reward": 0.3046875, + "reward_std": 0.2727435827255249, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518990516663, + "sampling/importance_sampling_ratio/min": 9.316575415141415e-06, + "sampling/sampling_logp_difference/max": 11.583715438842773, + "sampling/sampling_logp_difference/mean": 0.021076630800962448, + "step": 4 + }, + { + "clip_ratio/high_max": 1.666655725784949e-05, + "clip_ratio/high_mean": 4.1666393144623726e-06, + "clip_ratio/low_mean": 2.0471738594096678e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4638378022245888e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 5535.828125, + "completions/mean_terminated_length": 5535.828125, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 1.0935996025800705, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003226158209145069, + "learning_rate": 
1e-05, + "loss": 0.0451, + "num_tokens": 3727959.0, + "reward": 0.3046875, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 5.9354013501433656e-05, + "sampling/sampling_logp_difference/max": 9.731990814208984, + "sampling/sampling_logp_difference/mean": 0.019589610397815704, + "step": 5 + }, + { + "clip_ratio/high_max": 1.9090986825176515e-05, + "clip_ratio/high_mean": 4.772746706294129e-06, + "clip_ratio/low_mean": 1.995503203033877e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4727778054511873e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14820.0, + "completions/mean_length": 4552.9296875, + "completions/mean_terminated_length": 4459.771484375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9019740223884583, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002911025658249855, + "learning_rate": 1e-05, + "loss": 0.0742, + "num_tokens": 4329342.0, + "reward": 0.4375, + "reward_std": 0.3448186218738556, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999092817306519, + "sampling/importance_sampling_ratio/min": 0.0010333366226404905, + "sampling/sampling_logp_difference/max": 6.874962329864502, + "sampling/sampling_logp_difference/mean": 0.01768551766872406, + "step": 6 + }, + { + "clip_ratio/high_max": 9.186584293274791e-06, + "clip_ratio/high_mean": 2.2966460733186977e-06, + "clip_ratio/low_mean": 1.9561108047128073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.185775372254284e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14197.0, + "completions/mean_length": 5849.4921875, + "completions/mean_terminated_length": 5682.2783203125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1362405940890312, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018774238415062428, + "learning_rate": 1e-05, + "loss": 0.0106, + "num_tokens": 5097245.0, + "reward": 0.1953125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999154210090637, + "sampling/importance_sampling_ratio/min": 0.00020401047368068248, + "sampling/sampling_logp_difference/max": 8.497339248657227, + "sampling/sampling_logp_difference/mean": 0.020379718393087387, + "step": 7 + }, + { + "clip_ratio/high_max": 7.997417014848907e-06, + "clip_ratio/high_mean": 1.9993542537122266e-06, + "clip_ratio/low_mean": 4.003535150332027e-05, + "clip_ratio/low_min": 4.32017714047106e-06, + "clip_ratio/region_mean": 4.203470598440617e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16045.0, + "completions/mean_length": 5744.6796875, + "completions/mean_terminated_length": 5575.8017578125, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "entropy": 0.989105150103569, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0025437718722969294, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 5851844.0, + "reward": 0.375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999915957450867, + "sampling/importance_sampling_ratio/min": 4.312803503125906e-05, + "sampling/sampling_logp_difference/max": 10.051337242126465, + "sampling/sampling_logp_difference/mean": 0.020163267850875854, + "step": 8 + }, + { + "clip_ratio/high_max": 5.422758022177732e-06, + "clip_ratio/high_mean": 1.355689505544433e-06, + "clip_ratio/low_mean": 3.697482691222831e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833051641777274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15874.0, + "completions/mean_length": 4075.9609375, + "completions/mean_terminated_length": 3979.047119140625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.8887222409248352, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024127138312906027, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 6392287.0, + "reward": 0.4140625, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527335166931, + "sampling/importance_sampling_ratio/min": 4.007668394478969e-05, + "sampling/sampling_logp_difference/max": 10.124715805053711, + "sampling/sampling_logp_difference/mean": 0.017202626913785934, + "step": 9 + }, + { + "clip_ratio/high_max": 1.9414138932916103e-05, + "clip_ratio/high_mean": 5.8681449672803865e-06, + "clip_ratio/low_mean": 4.918625745631289e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.5054402309906436e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15691.0, + "completions/mean_length": 5248.3984375, + "completions/mean_terminated_length": 4981.14404296875, + "completions/min_length": 316.0, + "completions/min_terminated_length": 
316.0, + "entropy": 0.7111036106944084, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0028383845929056406, + "learning_rate": 1e-05, + "loss": 0.1027, + "num_tokens": 7081234.0, + "reward": 0.5625, + "reward_std": 0.4150439500808716, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589323997498, + "sampling/importance_sampling_ratio/min": 0.00037057927693240345, + "sampling/sampling_logp_difference/max": 7.900443077087402, + "sampling/sampling_logp_difference/mean": 0.01570993661880493, + "step": 10 + }, + { + "clip_ratio/high_max": 7.0035857788752764e-06, + "clip_ratio/high_mean": 1.7508964447188191e-06, + "clip_ratio/low_mean": 1.4078211620471848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5829108065190667e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16172.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 4956.6015625, + "completions/mean_terminated_length": 4956.6015625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "entropy": 1.026921771466732, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001392067177221179, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 7735511.0, + "reward": 0.328125, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.00033587991492822766, + "sampling/sampling_logp_difference/max": 7.9987568855285645, + "sampling/sampling_logp_difference/mean": 0.019166938960552216, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 2.9272594929352636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9272594929352636e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16229.0, + "completions/mean_length": 5858.953125, + "completions/mean_terminated_length": 5691.88916015625, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 1.1407905519008636, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018726681591942906, + "learning_rate": 1e-05, + "loss": 0.092, + "num_tokens": 8506089.0, + "reward": 0.25, + "reward_std": 0.2829982340335846, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998714327812195, + "sampling/importance_sampling_ratio/min": 2.4313605536008254e-05, + "sampling/sampling_logp_difference/max": 10.62447452545166, + "sampling/sampling_logp_difference/mean": 0.020790230482816696, + "step": 12 + }, + { + "clip_ratio/high_max": 4.318236733524827e-06, + "clip_ratio/high_mean": 1.0795591833812068e-06, + "clip_ratio/low_mean": 3.3191785689723474e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.427134498679152e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15040.0, + "completions/mean_length": 6801.09375, + "completions/mean_terminated_length": 6571.1044921875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 1.185454584658146, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031675526406615973, + "learning_rate": 1e-05, + "loss": 0.0244, + "num_tokens": 9398597.0, + "reward": 0.21875, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 
0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000109672546387, + "sampling/importance_sampling_ratio/min": 0.0010334982071071863, + "sampling/sampling_logp_difference/max": 6.874805927276611, + "sampling/sampling_logp_difference/mean": 0.021565770730376244, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3892819879401941e-05, + "clip_ratio/high_mean": 3.4732049698504852e-06, + "clip_ratio/low_mean": 2.9275798283379117e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2749003707976954e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15496.0, + "completions/mean_length": 4673.578125, + "completions/mean_terminated_length": 4581.3701171875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9907316789031029, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024632434360682964, + "learning_rate": 1e-05, + "loss": 0.0147, + "num_tokens": 10016559.0, + "reward": 0.3046875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000066757202148, + "sampling/importance_sampling_ratio/min": 0.001339821144938469, + "sampling/sampling_logp_difference/max": 6.6152191162109375, + "sampling/sampling_logp_difference/mean": 0.019262395799160004, + "step": 14 + }, + { + "clip_ratio/high_max": 1.6510958175786072e-05, + "clip_ratio/high_mean": 4.127739543946518e-06, + "clip_ratio/low_mean": 1.770910688492222e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1836846656242415e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 4617.4140625, + 
"completions/mean_terminated_length": 4524.763671875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 1.100720427930355, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032584660220891237, + "learning_rate": 1e-05, + "loss": 0.0047, + "num_tokens": 10628084.0, + "reward": 0.375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999375343322754, + "sampling/importance_sampling_ratio/min": 4.245261607138673e-06, + "sampling/sampling_logp_difference/max": 12.369707107543945, + "sampling/sampling_logp_difference/mean": 0.019928477704524994, + "step": 15 + }, + { + "clip_ratio/high_max": 9.921910532284528e-06, + "clip_ratio/high_mean": 3.5021869280171813e-06, + "clip_ratio/low_mean": 1.4621458831243217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.81236457592604e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13925.0, + "completions/mean_length": 5611.5625, + "completions/mean_terminated_length": 5353.0244140625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 1.0112926587462425, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001977710286155343, + "learning_rate": 1e-05, + "loss": -0.0229, + "num_tokens": 11364332.0, + "reward": 0.2109375, + "reward_std": 0.21146979928016663, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999548196792603, + "sampling/importance_sampling_ratio/min": 4.5400451199384406e-05, + "sampling/sampling_logp_difference/max": 9.999988555908203, + 
"sampling/sampling_logp_difference/mean": 0.019674532115459442, + "step": 16 + }, + { + "clip_ratio/high_max": 8.318262189277448e-06, + "clip_ratio/high_mean": 2.079565547319362e-06, + "clip_ratio/low_mean": 3.345101845297904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5530583886611566e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14463.0, + "completions/mean_length": 5321.7578125, + "completions/mean_terminated_length": 5234.6533203125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.9611762389540672, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002321678213775158, + "learning_rate": 1e-05, + "loss": 0.0089, + "num_tokens": 12067365.0, + "reward": 0.2734375, + "reward_std": 0.22225630283355713, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 5.329983650881331e-06, + "sampling/sampling_logp_difference/max": 12.142162322998047, + "sampling/sampling_logp_difference/mean": 0.019090529531240463, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.1286541861372825e-05, + "clip_ratio/low_min": 4.589008312905207e-06, + "clip_ratio/region_mean": 5.1286541861372825e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15906.0, + "completions/mean_length": 6747.8125, + "completions/mean_terminated_length": 6516.54443359375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.8531035929918289, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003335036803036928, + "learning_rate": 1e-05, + "loss": 0.0494, 
+ "num_tokens": 12950989.0, + "reward": 0.3515625, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999262690544128, + "sampling/importance_sampling_ratio/min": 0.0024787711445242167, + "sampling/sampling_logp_difference/max": 5.999992370605469, + "sampling/sampling_logp_difference/mean": 0.017946189269423485, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.059201583255344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.059201583255344e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14653.0, + "completions/mean_length": 5237.5390625, + "completions/mean_terminated_length": 5060.611328125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.9604798555374146, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028048555832356215, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 13641594.0, + "reward": 0.3359375, + "reward_std": 0.27851757407188416, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999921977519989, + "sampling/importance_sampling_ratio/min": 0.0003354719083290547, + "sampling/sampling_logp_difference/max": 7.999972343444824, + "sampling/sampling_logp_difference/mean": 0.01799672283232212, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7391609592086752e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7391609592086752e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 14949.0, + "completions/mean_length": 5088.71875, + "completions/mean_terminated_length": 4999.779296875, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.9381079524755478, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015588597161695361, + "learning_rate": 1e-05, + "loss": 0.0593, + "num_tokens": 14310022.0, + "reward": 0.3515625, + "reward_std": 0.24723157286643982, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999968945980072, + "sampling/importance_sampling_ratio/min": 0.0008060967666096985, + "sampling/sampling_logp_difference/max": 7.123306751251221, + "sampling/sampling_logp_difference/mean": 0.018512990325689316, + "step": 20 + }, + { + "clip_ratio/high_max": 1.4323140021588188e-05, + "clip_ratio/high_mean": 3.580785005397047e-06, + "clip_ratio/low_mean": 2.3172296550910687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6753081669994572e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15268.0, + "completions/max_terminated_length": 15268.0, + "completions/mean_length": 5374.375, + "completions/mean_terminated_length": 5374.375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 1.198778212070465, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023761435877531767, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 15017710.0, + "reward": 0.21875, + "reward_std": 0.2432974874973297, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000046730041504, + "sampling/importance_sampling_ratio/min": 2.2531810827786103e-05, + 
"sampling/sampling_logp_difference/max": 10.700582504272461, + "sampling/sampling_logp_difference/mean": 0.02083735726773739, + "step": 21 + }, + { + "clip_ratio/high_max": 8.891734069038648e-06, + "clip_ratio/high_mean": 2.222933517259662e-06, + "clip_ratio/low_mean": 3.576970004814939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.799263345172221e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16131.0, + "completions/max_terminated_length": 16131.0, + "completions/mean_length": 5016.484375, + "completions/mean_terminated_length": 5016.484375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 1.0073698610067368, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024441592395305634, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 15680364.0, + "reward": 0.2734375, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.0009118849993683398, + "sampling/sampling_logp_difference/max": 6.999996662139893, + "sampling/sampling_logp_difference/mean": 0.019295595586299896, + "step": 22 + }, + { + "clip_ratio/high_max": 7.065739737299737e-06, + "clip_ratio/high_mean": 1.7664349343249341e-06, + "clip_ratio/low_mean": 4.2640075662347954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.440651059667289e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14751.0, + "completions/mean_length": 6798.171875, + "completions/mean_terminated_length": 6408.50390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0817051529884338, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.1875, + 
"grad_norm": 0.0035431634169071913, + "learning_rate": 1e-05, + "loss": -0.0282, + "num_tokens": 16572210.0, + "reward": 0.3046875, + "reward_std": 0.3645517826080322, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 0.00014901062240824103, + "sampling/sampling_logp_difference/max": 8.811492919921875, + "sampling/sampling_logp_difference/mean": 0.021285930648446083, + "step": 23 + }, + { + "clip_ratio/high_max": 1.8304424429516075e-05, + "clip_ratio/high_mean": 4.576106107379019e-06, + "clip_ratio/low_mean": 3.600540730985813e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0581513530923985e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14866.0, + "completions/mean_length": 5388.6875, + "completions/mean_terminated_length": 5302.1103515625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 1.1402523145079613, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003914100583642721, + "learning_rate": 1e-05, + "loss": 0.0017, + "num_tokens": 17282394.0, + "reward": 0.234375, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000356435775757, + "sampling/importance_sampling_ratio/min": 4.936015557177598e-06, + "sampling/sampling_logp_difference/max": 12.218952178955078, + "sampling/sampling_logp_difference/mean": 0.020141229033470154, + "step": 24 + }, + { + "clip_ratio/high_max": 3.6923258903698297e-06, + "clip_ratio/high_mean": 9.230814725924574e-07, + "clip_ratio/low_mean": 4.0747915363681386e-05, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 4.1670996779430425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15553.0, + "completions/mean_length": 5140.625, + "completions/mean_terminated_length": 4962.1591796875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 0.9437280669808388, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026927352882921696, + "learning_rate": 1e-05, + "loss": 0.0467, + "num_tokens": 17963970.0, + "reward": 0.3125, + "reward_std": 0.3009189963340759, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961256980896, + "sampling/importance_sampling_ratio/min": 6.243770621949807e-05, + "sampling/sampling_logp_difference/max": 9.681341171264648, + "sampling/sampling_logp_difference/mean": 0.02010953240096569, + "step": 25 + }, + { + "clip_ratio/high_max": 9.832700470724376e-06, + "clip_ratio/high_mean": 2.458175117681094e-06, + "clip_ratio/low_mean": 1.5558874792986899e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014062596979784e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12937.0, + "completions/max_terminated_length": 12937.0, + "completions/mean_length": 5454.8515625, + "completions/mean_terminated_length": 5454.8515625, + "completions/min_length": 717.0, + "completions/min_terminated_length": 717.0, + "entropy": 1.1385098099708557, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027164353523403406, + "learning_rate": 1e-05, + "loss": 0.009, + "num_tokens": 18680591.0, + "reward": 0.296875, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000078558921814, + "sampling/importance_sampling_ratio/min": 0.005307729355990887, + "sampling/sampling_logp_difference/max": 5.238591194152832, + "sampling/sampling_logp_difference/mean": 0.020798511803150177, + "step": 26 + }, + { + "clip_ratio/high_max": 1.8564560832601273e-05, + "clip_ratio/high_mean": 4.641140208150318e-06, + "clip_ratio/low_mean": 1.8977171066580922e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.361831138841808e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15341.0, + "completions/mean_length": 6053.4296875, + "completions/mean_terminated_length": 5972.08642578125, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "entropy": 1.006893776357174, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016045555239543319, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 19474438.0, + "reward": 0.2578125, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 4.606551374308765e-05, + "sampling/sampling_logp_difference/max": 9.985445976257324, + "sampling/sampling_logp_difference/mean": 0.01937020570039749, + "step": 27 + }, + { + "clip_ratio/high_max": 3.951194685214432e-06, + "clip_ratio/high_mean": 9.87798671303608e-07, + "clip_ratio/low_mean": 3.949826844973359e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.048606700735036e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 5732.6328125, + "completions/mean_terminated_length": 5563.56396484375, + "completions/min_length": 658.0, + 
"completions/min_terminated_length": 658.0, + "entropy": 1.0205800458788872, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017355874879285693, + "learning_rate": 1e-05, + "loss": 0.0254, + "num_tokens": 20229199.0, + "reward": 0.2578125, + "reward_std": 0.32695505023002625, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966561794281, + "sampling/importance_sampling_ratio/min": 9.611312270862982e-05, + "sampling/sampling_logp_difference/max": 9.249984741210938, + "sampling/sampling_logp_difference/mean": 0.020152747631072998, + "step": 28 + }, + { + "clip_ratio/high_max": 1.1344701988491579e-05, + "clip_ratio/high_mean": 2.8361754971228947e-06, + "clip_ratio/low_mean": 6.441893049213832e-05, + "clip_ratio/low_min": 3.704581786223571e-06, + "clip_ratio/region_mean": 6.72551062734783e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11633.0, + "completions/mean_length": 4968.0546875, + "completions/mean_terminated_length": 4786.849609375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 1.0484329834580421, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002361088991165161, + "learning_rate": 1e-05, + "loss": 0.1348, + "num_tokens": 20885790.0, + "reward": 0.265625, + "reward_std": 0.3180084228515625, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000216960906982, + "sampling/importance_sampling_ratio/min": 0.006972009316086769, + "sampling/sampling_logp_difference/max": 4.965851783752441, + "sampling/sampling_logp_difference/mean": 0.018748482689261436, + "step": 29 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.939045106766571e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.939045106766571e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12655.0, + "completions/mean_length": 4634.640625, + "completions/mean_terminated_length": 4542.1259765625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.0479918718338013, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002287437906488776, + "learning_rate": 1e-05, + "loss": -0.0157, + "num_tokens": 21497480.0, + "reward": 0.34375, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999415874481201, + "sampling/importance_sampling_ratio/min": 8.729176670385641e-07, + "sampling/sampling_logp_difference/max": 13.951424598693848, + "sampling/sampling_logp_difference/mean": 0.019327208399772644, + "step": 30 + }, + { + "clip_ratio/high_max": 2.4600531105534174e-05, + "clip_ratio/high_mean": 7.4163915542158065e-06, + "clip_ratio/low_mean": 3.8106682723082486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.552307382255094e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15999.0, + "completions/mean_length": 5922.8359375, + "completions/mean_terminated_length": 5840.46435546875, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 1.1925376057624817, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002192641608417034, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 22276267.0, + "reward": 0.1953125, + "reward_std": 0.22461041808128357, + 
"rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999226987361908, + "sampling/importance_sampling_ratio/min": 1.546916053030145e-07, + "sampling/sampling_logp_difference/max": 15.681832313537598, + "sampling/sampling_logp_difference/mean": 0.026596486568450928, + "step": 31 + }, + { + "clip_ratio/high_max": 1.3442309864331037e-05, + "clip_ratio/high_mean": 3.360577466082759e-06, + "clip_ratio/low_mean": 2.185166863455379e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5212245873262873e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15082.0, + "completions/mean_length": 5835.5, + "completions/mean_terminated_length": 5752.44091796875, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 1.229158878326416, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0007279868004843593, + "learning_rate": 1e-05, + "loss": 0.0081, + "num_tokens": 23044019.0, + "reward": 0.1796875, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998399019241333, + "sampling/importance_sampling_ratio/min": 1.414701245039396e-07, + "sampling/sampling_logp_difference/max": 15.771177291870117, + "sampling/sampling_logp_difference/mean": 0.020945575088262558, + "step": 32 + }, + { + "clip_ratio/high_max": 1.277465526072774e-05, + "clip_ratio/high_mean": 3.193663815181935e-06, + "clip_ratio/low_mean": 3.348547249970579e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.667913586014038e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14729.0, + "completions/max_terminated_length": 14729.0, 
+ "completions/mean_length": 5070.1484375, + "completions/mean_terminated_length": 5070.1484375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 1.0323031097650528, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022168844006955624, + "learning_rate": 1e-05, + "loss": 0.0657, + "num_tokens": 23714878.0, + "reward": 0.3515625, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999499917030334, + "sampling/importance_sampling_ratio/min": 0.0037885017227381468, + "sampling/sampling_logp_difference/max": 5.575784683227539, + "sampling/sampling_logp_difference/mean": 0.01919984258711338, + "step": 33 + }, + { + "clip_ratio/high_max": 1.2069132026226725e-05, + "clip_ratio/high_mean": 3.0172830065566814e-06, + "clip_ratio/low_mean": 3.323697501400602e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6254257338441676e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15334.0, + "completions/mean_length": 4792.2578125, + "completions/mean_terminated_length": 4700.984375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.9981634542346001, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001841123914346099, + "learning_rate": 1e-05, + "loss": 0.0577, + "num_tokens": 24347119.0, + "reward": 0.4375, + "reward_std": 0.3524719774723053, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999489784240723, + "sampling/importance_sampling_ratio/min": 4.2607393879734445e-06, + "sampling/sampling_logp_difference/max": 
12.366067886352539, + "sampling/sampling_logp_difference/mean": 0.018039174377918243, + "step": 34 + }, + { + "clip_ratio/high_max": 1.3947896150057204e-05, + "clip_ratio/high_mean": 4.6235029458330246e-06, + "clip_ratio/low_mean": 4.1055162455450045e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5678665628656745e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16032.0, + "completions/mean_length": 6841.375, + "completions/mean_terminated_length": 6453.46337890625, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "entropy": 1.0972845032811165, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00202017929404974, + "learning_rate": 1e-05, + "loss": -0.0092, + "num_tokens": 25241911.0, + "reward": 0.25, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999304413795471, + "sampling/importance_sampling_ratio/min": 0.00026355183217674494, + "sampling/sampling_logp_difference/max": 8.241260528564453, + "sampling/sampling_logp_difference/mean": 0.02115095779299736, + "step": 35 + }, + { + "clip_ratio/high_max": 4.14414989791112e-06, + "clip_ratio/high_mean": 1.03603747447778e-06, + "clip_ratio/low_mean": 4.4157833031022165e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.519387027812627e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16218.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 5645.6640625, + "completions/mean_terminated_length": 5645.6640625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "entropy": 1.0653726011514664, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003633195301517844, + 
"learning_rate": 1e-05, + "loss": -0.0409, + "num_tokens": 25982588.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999817967414856, + "sampling/importance_sampling_ratio/min": 0.0007106869597919285, + "sampling/sampling_logp_difference/max": 7.249278545379639, + "sampling/sampling_logp_difference/mean": 0.02010509930551052, + "step": 36 + }, + { + "clip_ratio/high_max": 7.0509927354578394e-06, + "clip_ratio/high_mean": 1.7627481838644599e-06, + "clip_ratio/low_mean": 3.606558789215342e-05, + "clip_ratio/low_min": 3.3240260108868824e-06, + "clip_ratio/region_mean": 3.782833596233104e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 7335.1875, + "completions/mean_terminated_length": 7118.01611328125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.9340982511639595, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017444937257096171, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 26946156.0, + "reward": 0.171875, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998713731765747, + "sampling/importance_sampling_ratio/min": 2.5868248485494405e-05, + "sampling/sampling_logp_difference/max": 10.562494277954102, + "sampling/sampling_logp_difference/mean": 0.01965884119272232, + "step": 37 + }, + { + "clip_ratio/high_max": 1.1849869679281255e-05, + "clip_ratio/high_mean": 2.962467419820314e-06, + "clip_ratio/low_mean": 2.5232500775018707e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.8194967853778508e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14151.0, + "completions/mean_length": 5998.8671875, + "completions/mean_terminated_length": 5917.09423828125, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "entropy": 0.975816160440445, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020293404813855886, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 27733059.0, + "reward": 0.2734375, + "reward_std": 0.2908889353275299, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999889612197876, + "sampling/importance_sampling_ratio/min": 0.00892679300159216, + "sampling/sampling_logp_difference/max": 4.718698024749756, + "sampling/sampling_logp_difference/mean": 0.01972467266023159, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.05586318315909e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05586318315909e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5599.4375, + "completions/mean_terminated_length": 5599.4375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 1.006210096180439, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035929102450609207, + "learning_rate": 1e-05, + "loss": 0.02, + "num_tokens": 28468843.0, + "reward": 0.2578125, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.009500927291810513, + "sampling/sampling_logp_difference/max": 4.656365871429443, + "sampling/sampling_logp_difference/mean": 0.019885972142219543, + "step": 39 + }, + { + "clip_ratio/high_max": 1.1638733667496126e-05, + "clip_ratio/high_mean": 2.9096834168740315e-06, + "clip_ratio/low_mean": 3.210125066743785e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5010934084311884e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14843.0, + "completions/max_terminated_length": 14843.0, + "completions/mean_length": 5035.7734375, + "completions/mean_terminated_length": 5035.7734375, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 1.004905492067337, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023170222993940115, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 29133270.0, + "reward": 0.3046875, + "reward_std": 0.3037971258163452, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 4.264977542334236e-05, + "sampling/sampling_logp_difference/max": 10.062488555908203, + "sampling/sampling_logp_difference/mean": 0.019529584795236588, + "step": 40 + }, + { + "clip_ratio/high_max": 9.932905413734261e-06, + "clip_ratio/high_mean": 2.4832263534335652e-06, + "clip_ratio/low_mean": 4.655256179830758e-05, + "clip_ratio/low_min": 1.288991325054667e-05, + "clip_ratio/region_mean": 4.903578792436747e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 4865.6171875, + "completions/mean_terminated_length": 4774.92138671875, + "completions/min_length": 687.0, + 
"completions/min_terminated_length": 687.0, + "entropy": 0.9472262933850288, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0024069426581263542, + "learning_rate": 1e-05, + "loss": 0.0435, + "num_tokens": 29774973.0, + "reward": 0.4296875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000076293945312, + "sampling/importance_sampling_ratio/min": 4.94040648391092e-07, + "sampling/sampling_logp_difference/max": 14.520648002624512, + "sampling/sampling_logp_difference/mean": 0.017961984500288963, + "step": 41 + }, + { + "clip_ratio/high_max": 1.4300524526333902e-05, + "clip_ratio/high_mean": 4.549106392914837e-06, + "clip_ratio/low_mean": 8.310655789500743e-05, + "clip_ratio/low_min": 3.895901500072796e-06, + "clip_ratio/region_mean": 8.765566417423543e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14604.0, + "completions/max_terminated_length": 14604.0, + "completions/mean_length": 5928.3828125, + "completions/mean_terminated_length": 5928.3828125, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.9451013877987862, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019357368582859635, + "learning_rate": 1e-05, + "loss": 0.0659, + "num_tokens": 30557014.0, + "reward": 0.2734375, + "reward_std": 0.3227117359638214, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000040054321289, + "sampling/importance_sampling_ratio/min": 4.787445504916832e-06, + "sampling/sampling_logp_difference/max": 12.249513626098633, + "sampling/sampling_logp_difference/mean": 0.020681140944361687, + "step": 42 + }, + { + 
"clip_ratio/high_max": 1.6088630218291655e-05, + "clip_ratio/high_mean": 4.022157554572914e-06, + "clip_ratio/low_mean": 4.4498895476863254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.852105257668882e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15935.0, + "completions/max_terminated_length": 15935.0, + "completions/mean_length": 5253.890625, + "completions/mean_terminated_length": 5253.890625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 1.0573822036385536, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027430339250713587, + "learning_rate": 1e-05, + "loss": -0.0295, + "num_tokens": 31252752.0, + "reward": 0.3828125, + "reward_std": 0.3564237058162689, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 0.0019366396591067314, + "sampling/sampling_logp_difference/max": 6.246800899505615, + "sampling/sampling_logp_difference/mean": 0.019426241517066956, + "step": 43 + }, + { + "clip_ratio/high_max": 1.80760021066817e-05, + "clip_ratio/high_mean": 4.519000526670425e-06, + "clip_ratio/low_mean": 2.491120585546014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9430206382130564e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12610.0, + "completions/mean_length": 4434.7890625, + "completions/mean_terminated_length": 4340.70068359375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 1.0309192687273026, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027177443262189627, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 31839885.0, + "reward": 0.359375, + "reward_std": 
0.34010058641433716, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999918520450592, + "sampling/importance_sampling_ratio/min": 0.0010315371910110116, + "sampling/sampling_logp_difference/max": 6.876705169677734, + "sampling/sampling_logp_difference/mean": 0.01883832737803459, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9404036808955425e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404036808955425e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14982.0, + "completions/mean_length": 6810.578125, + "completions/mean_terminated_length": 6735.19677734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 1.134837955236435, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025250029284507036, + "learning_rate": 1e-05, + "loss": -0.0016, + "num_tokens": 32734551.0, + "reward": 0.2421875, + "reward_std": 0.21436068415641785, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000019073486328, + "sampling/importance_sampling_ratio/min": 0.0014875066699460149, + "sampling/sampling_logp_difference/max": 6.510653972625732, + "sampling/sampling_logp_difference/mean": 0.02130994386970997, + "step": 45 + }, + { + "clip_ratio/high_max": 1.1104832083219662e-05, + "clip_ratio/high_mean": 2.7762080208049156e-06, + "clip_ratio/low_mean": 2.9984376055836037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.276058407664095e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16351.0, + 
"completions/mean_length": 6623.3359375, + "completions/mean_terminated_length": 6308.4755859375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.990560457110405, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018365891883149743, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 33600498.0, + "reward": 0.3203125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 5.727278562517313e-07, + "sampling/sampling_logp_difference/max": 14.372855186462402, + "sampling/sampling_logp_difference/mean": 0.019745903089642525, + "step": 46 + }, + { + "clip_ratio/high_max": 1.5849275314394617e-05, + "clip_ratio/high_mean": 3.962318828598654e-06, + "clip_ratio/low_mean": 2.2989276772023004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.695159548693482e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14530.0, + "completions/mean_length": 5414.046875, + "completions/mean_terminated_length": 5239.9208984375, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 1.213307112455368, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016015933360904455, + "learning_rate": 1e-05, + "loss": 0.0239, + "num_tokens": 34322776.0, + "reward": 0.2109375, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999943733215332, + "sampling/importance_sampling_ratio/min": 0.0006993028800934553, + "sampling/sampling_logp_difference/max": 
7.2654266357421875, + "sampling/sampling_logp_difference/mean": 0.021634424105286598, + "step": 47 + }, + { + "clip_ratio/high_max": 3.0635404527856736e-05, + "clip_ratio/high_mean": 7.658851131964184e-06, + "clip_ratio/low_mean": 4.565159474623215e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3310446219256846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16066.0, + "completions/max_terminated_length": 16066.0, + "completions/mean_length": 6082.1015625, + "completions/mean_terminated_length": 6082.1015625, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.8880708515644073, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002024279674515128, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 35118853.0, + "reward": 0.4765625, + "reward_std": 0.3619031310081482, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 3.121717236354016e-05, + "sampling/sampling_logp_difference/max": 10.374542236328125, + "sampling/sampling_logp_difference/mean": 0.01861739531159401, + "step": 48 + }, + { + "clip_ratio/high_max": 1.718443036224926e-05, + "clip_ratio/high_mean": 4.296107590562315e-06, + "clip_ratio/low_mean": 3.4419200915181136e-05, + "clip_ratio/low_min": 3.7744964629382594e-06, + "clip_ratio/region_mean": 3.871530816468294e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16287.0, + "completions/mean_length": 6382.3828125, + "completions/mean_terminated_length": 6059.75, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "entropy": 0.8597949668765068, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.002679568249732256, + "learning_rate": 1e-05, + "loss": 0.0749, + "num_tokens": 35956350.0, + "reward": 0.46875, + "reward_std": 0.39530590176582336, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000333786010742, + "sampling/importance_sampling_ratio/min": 0.0005964707233943045, + "sampling/sampling_logp_difference/max": 7.424480438232422, + "sampling/sampling_logp_difference/mean": 0.01830567792057991, + "step": 49 + }, + { + "clip_ratio/high_max": 7.470714990631677e-06, + "clip_ratio/high_mean": 1.8676787476579193e-06, + "clip_ratio/low_mean": 2.8441645326893195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0309323619803763e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 6112.7890625, + "completions/mean_terminated_length": 6112.7890625, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.9591199606657028, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0011262348853051662, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 36756171.0, + "reward": 0.359375, + "reward_std": 0.2743412256240845, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999510049819946, + "sampling/importance_sampling_ratio/min": 1.2219889867992606e-05, + "sampling/sampling_logp_difference/max": 11.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01950032450258732, + "step": 50 + }, + { + "clip_ratio/high_max": 3.7807756143592997e-06, + "clip_ratio/high_mean": 9.451939035898249e-07, + "clip_ratio/low_mean": 3.906526939090327e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.001046335133651e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16169.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 6744.390625, + "completions/mean_terminated_length": 6744.390625, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "entropy": 1.061469852924347, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002305408474057913, + "learning_rate": 1e-05, + "loss": 0.0496, + "num_tokens": 37643573.0, + "reward": 0.234375, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986124992371, + "sampling/importance_sampling_ratio/min": 9.516369573248085e-06, + "sampling/sampling_logp_difference/max": 11.56249713897705, + "sampling/sampling_logp_difference/mean": 0.020016517490148544, + "step": 51 + }, + { + "clip_ratio/high_max": 1.3845812645740807e-05, + "clip_ratio/high_mean": 3.4614531614352018e-06, + "clip_ratio/low_mean": 2.3906941066798026e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7368394228233228e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15606.0, + "completions/max_terminated_length": 15606.0, + "completions/mean_length": 5723.0859375, + "completions/mean_terminated_length": 5723.0859375, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "entropy": 1.0918374806642532, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002079444006085396, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 38399000.0, + "reward": 0.34375, + "reward_std": 0.28353503346443176, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999094009399414, + "sampling/importance_sampling_ratio/min": 0.00247886567376554, + "sampling/sampling_logp_difference/max": 5.9999542236328125, + "sampling/sampling_logp_difference/mean": 0.02025545760989189, + "step": 52 + }, + { + "clip_ratio/high_max": 1.6330426660715602e-05, + "clip_ratio/high_mean": 4.082606665178901e-06, + "clip_ratio/low_mean": 4.608668984928954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0169297423963144e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15124.0, + "completions/mean_length": 6075.078125, + "completions/mean_terminated_length": 5827.6640625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "entropy": 1.0526456609368324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002548371907323599, + "learning_rate": 1e-05, + "loss": 0.0005, + "num_tokens": 39195762.0, + "reward": 0.28125, + "reward_std": 0.2903746962547302, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.0003802210558205843, + "sampling/sampling_logp_difference/max": 7.874757766723633, + "sampling/sampling_logp_difference/mean": 0.02132822386920452, + "step": 53 + }, + { + "clip_ratio/high_max": 1.2557530681078788e-05, + "clip_ratio/high_mean": 3.139382670269697e-06, + "clip_ratio/low_mean": 5.579355536156072e-05, + "clip_ratio/low_min": 6.314919346550596e-06, + "clip_ratio/region_mean": 5.893293734970939e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14981.0, + "completions/mean_length": 6273.203125, + "completions/mean_terminated_length": 6193.59033203125, + "completions/min_length": 
823.0, + "completions/min_terminated_length": 823.0, + "entropy": 0.9629805982112885, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001929077785462141, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 40016988.0, + "reward": 0.3828125, + "reward_std": 0.35718512535095215, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000178813934326, + "sampling/importance_sampling_ratio/min": 0.004126251209527254, + "sampling/sampling_logp_difference/max": 5.490386009216309, + "sampling/sampling_logp_difference/mean": 0.01974763534963131, + "step": 54 + }, + { + "clip_ratio/high_max": 5.326855898601934e-06, + "clip_ratio/high_mean": 1.3317139746504836e-06, + "clip_ratio/low_mean": 1.2195182989671594e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3526897078008915e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12113.0, + "completions/mean_length": 4658.1640625, + "completions/mean_terminated_length": 4565.83447265625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.950105108320713, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002910251496359706, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 40632681.0, + "reward": 0.390625, + "reward_std": 0.28353503346443176, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000574588775635, + "sampling/importance_sampling_ratio/min": 0.0017036369536072016, + "sampling/sampling_logp_difference/max": 6.374989986419678, + "sampling/sampling_logp_difference/mean": 0.018849056214094162, + "step": 55 + }, + { + 
"clip_ratio/high_max": 1.1988173810095759e-05, + "clip_ratio/high_mean": 2.9970434525239398e-06, + "clip_ratio/low_mean": 2.1473538311056473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4470581195146224e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15638.0, + "completions/mean_length": 6582.953125, + "completions/mean_terminated_length": 5756.94921875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.8884479179978371, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018201791681349277, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 41498939.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.00011687594087561592, + "sampling/sampling_logp_difference/max": 9.054397583007812, + "sampling/sampling_logp_difference/mean": 0.018637457862496376, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9767679873439192e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9767679873439192e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15902.0, + "completions/mean_length": 6408.4453125, + "completions/mean_terminated_length": 6250.103515625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "entropy": 1.0724121406674385, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027558596339076757, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 42338436.0, + "reward": 0.2578125, + "reward_std": 0.29196250438690186, + 
"rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000157356262207, + "sampling/importance_sampling_ratio/min": 2.144563404726796e-05, + "sampling/sampling_logp_difference/max": 10.74998950958252, + "sampling/sampling_logp_difference/mean": 0.020520739257335663, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.615732708160067e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.615732708160067e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 4527.8984375, + "completions/mean_terminated_length": 4243.35205078125, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 0.9734272584319115, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018782512051984668, + "learning_rate": 1e-05, + "loss": 0.0726, + "num_tokens": 42936215.0, + "reward": 0.4375, + "reward_std": 0.2890765368938446, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626874923706, + "sampling/importance_sampling_ratio/min": 6.564679324583267e-07, + "sampling/sampling_logp_difference/max": 14.2363920211792, + "sampling/sampling_logp_difference/mean": 0.018541917204856873, + "step": 58 + }, + { + "clip_ratio/high_max": 1.9634914679045323e-05, + "clip_ratio/high_mean": 4.908728669761331e-06, + "clip_ratio/low_mean": 3.605886263358116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096759084859514e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14340.0, + "completions/max_terminated_length": 14340.0, + "completions/mean_length": 
5389.609375, + "completions/mean_terminated_length": 5389.609375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.035320296883583, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003410179866477847, + "learning_rate": 1e-05, + "loss": 0.1109, + "num_tokens": 43643733.0, + "reward": 0.4609375, + "reward_std": 0.3040394186973572, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303221702576, + "sampling/importance_sampling_ratio/min": 7.063792872941121e-05, + "sampling/sampling_logp_difference/max": 9.557943344116211, + "sampling/sampling_logp_difference/mean": 0.01980186253786087, + "step": 59 + }, + { + "clip_ratio/high_max": 3.324525869174977e-05, + "clip_ratio/high_mean": 9.664479989623942e-06, + "clip_ratio/low_mean": 3.5182122701371554e-05, + "clip_ratio/low_min": 1.1718383575498592e-05, + "clip_ratio/region_mean": 4.484660291836917e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 5338.90625, + "completions/mean_terminated_length": 5251.93701171875, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.9680418893694878, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0013158825458958745, + "learning_rate": 1e-05, + "loss": 0.0851, + "num_tokens": 44345177.0, + "reward": 0.4140625, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 1.941789093962143e-07, + "sampling/sampling_logp_difference/max": 15.454485893249512, 
+ "sampling/sampling_logp_difference/mean": 0.019034607335925102, + "step": 60 + }, + { + "clip_ratio/high_max": 1.678188709774986e-05, + "clip_ratio/high_mean": 4.195471774437465e-06, + "clip_ratio/low_mean": 2.326147910025611e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.74569506473199e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15030.0, + "completions/mean_length": 5197.5859375, + "completions/mean_terminated_length": 5020.02392578125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.9385635256767273, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023898824583739042, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 45029716.0, + "reward": 0.328125, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999666213989258, + "sampling/importance_sampling_ratio/min": 0.0031843625474721193, + "sampling/sampling_logp_difference/max": 5.749503135681152, + "sampling/sampling_logp_difference/mean": 0.017856482416391373, + "step": 61 + }, + { + "clip_ratio/high_max": 2.8269179438211722e-05, + "clip_ratio/high_mean": 7.0672948595529306e-06, + "clip_ratio/low_mean": 4.551043662104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2577731821656926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6353.9375, + "completions/mean_terminated_length": 6194.73046875, + "completions/min_length": 1201.0, + "completions/min_terminated_length": 1201.0, + "entropy": 0.9195960611104965, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002777763642370701, + "learning_rate": 
1e-05, + "loss": 0.0556, + "num_tokens": 45861388.0, + "reward": 0.4140625, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.00033647287636995316, + "sampling/sampling_logp_difference/max": 7.996993064880371, + "sampling/sampling_logp_difference/mean": 0.019472671672701836, + "step": 62 + }, + { + "clip_ratio/high_max": 8.376483492611442e-06, + "clip_ratio/high_mean": 2.0941208731528604e-06, + "clip_ratio/low_mean": 1.1372792755537375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3466913628690236e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 7125.265625, + "completions/mean_terminated_length": 6669.91748046875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.9209358915686607, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012601700145751238, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 46793902.0, + "reward": 0.265625, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999377727508545, + "sampling/importance_sampling_ratio/min": 2.034899989666883e-05, + "sampling/sampling_logp_difference/max": 10.802478790283203, + "sampling/sampling_logp_difference/mean": 0.0191169623285532, + "step": 63 + }, + { + "clip_ratio/high_max": 6.630596089962637e-06, + "clip_ratio/high_mean": 1.6576490224906593e-06, + "clip_ratio/low_mean": 3.7912880316071096e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.957052945224859e-05, 
+ "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14620.0, + "completions/mean_length": 5895.4453125, + "completions/mean_terminated_length": 5812.8583984375, + "completions/min_length": 708.0, + "completions/min_terminated_length": 708.0, + "entropy": 0.9421789273619652, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036641336046159267, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 47567543.0, + "reward": 0.359375, + "reward_std": 0.2937847673892975, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999691247940063, + "sampling/importance_sampling_ratio/min": 2.1912494048592634e-05, + "sampling/sampling_logp_difference/max": 10.728453636169434, + "sampling/sampling_logp_difference/mean": 0.018009435385465622, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.6876661106834945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6876661106834945e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13935.0, + "completions/mean_length": 4643.9921875, + "completions/mean_terminated_length": 4551.55126953125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 1.1234809532761574, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003017786890268326, + "learning_rate": 1e-05, + "loss": 0.0403, + "num_tokens": 48180998.0, + "reward": 0.328125, + "reward_std": 0.2198973000049591, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + 
"sampling/importance_sampling_ratio/min": 1.4786172641834128e-06, + "sampling/sampling_logp_difference/max": 13.424403190612793, + "sampling/sampling_logp_difference/mean": 0.0194530226290226, + "step": 65 + }, + { + "clip_ratio/high_max": 1.1807285773102194e-05, + "clip_ratio/high_mean": 2.9518214432755485e-06, + "clip_ratio/low_mean": 1.7793156246170838e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0744977689446387e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 7134.5546875, + "completions/mean_terminated_length": 6679.66357421875, + "completions/min_length": 765.0, + "completions/min_terminated_length": 765.0, + "entropy": 1.0891609117388725, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021707366686314344, + "learning_rate": 1e-05, + "loss": 0.0079, + "num_tokens": 49113837.0, + "reward": 0.2578125, + "reward_std": 0.21778056025505066, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000441074371338, + "sampling/importance_sampling_ratio/min": 5.227705059951404e-06, + "sampling/sampling_logp_difference/max": 12.161538124084473, + "sampling/sampling_logp_difference/mean": 0.021074742078781128, + "step": 66 + }, + { + "clip_ratio/high_max": 1.785590688996308e-05, + "clip_ratio/high_mean": 4.46397672249077e-06, + "clip_ratio/low_mean": 4.4942946374249004e-05, + "clip_ratio/low_min": 4.320774223742774e-06, + "clip_ratio/region_mean": 4.940692338095687e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16178.0, + "completions/mean_length": 6770.3984375, + "completions/mean_terminated_length": 6694.70068359375, + "completions/min_length": 488.0, + "completions/min_terminated_length": 
488.0, + "entropy": 1.14402187615633, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003913953434675932, + "learning_rate": 1e-05, + "loss": -0.0645, + "num_tokens": 49999984.0, + "reward": 0.2890625, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999771118164062, + "sampling/importance_sampling_ratio/min": 0.00039836866199038923, + "sampling/sampling_logp_difference/max": 7.828132629394531, + "sampling/sampling_logp_difference/mean": 0.021658796817064285, + "step": 67 + }, + { + "clip_ratio/high_max": 6.990269412199268e-06, + "clip_ratio/high_mean": 3.4296645026188344e-06, + "clip_ratio/low_mean": 3.069889220341793e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.412855670603676e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 6743.3359375, + "completions/mean_terminated_length": 5926.33056640625, + "completions/min_length": 1195.0, + "completions/min_terminated_length": 1195.0, + "entropy": 0.8485476225614548, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0015872148796916008, + "learning_rate": 1e-05, + "loss": 0.0107, + "num_tokens": 50881939.0, + "reward": 0.2578125, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998895525932312, + "sampling/importance_sampling_ratio/min": 0.008705966174602509, + "sampling/sampling_logp_difference/max": 4.743746757507324, + "sampling/sampling_logp_difference/mean": 0.017901426181197166, + "step": 68 + }, + { + "clip_ratio/high_max": 1.300406438531354e-05, + 
"clip_ratio/high_mean": 3.251016096328385e-06, + "clip_ratio/low_mean": 3.055216484426637e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.380318116796843e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15756.0, + "completions/max_terminated_length": 15756.0, + "completions/mean_length": 5952.0234375, + "completions/mean_terminated_length": 5952.0234375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 1.1280141845345497, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037659234367311, + "learning_rate": 1e-05, + "loss": 0.1156, + "num_tokens": 51664814.0, + "reward": 0.2578125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009536743164, + "sampling/importance_sampling_ratio/min": 0.0037554434966295958, + "sampling/sampling_logp_difference/max": 5.5845489501953125, + "sampling/sampling_logp_difference/mean": 0.01998155191540718, + "step": 69 + }, + { + "clip_ratio/high_max": 9.465616585657699e-06, + "clip_ratio/high_mean": 2.3664041464144248e-06, + "clip_ratio/low_mean": 3.98842666982091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2250670958310366e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15301.0, + "completions/mean_length": 5533.171875, + "completions/mean_terminated_length": 5360.93701171875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.9313871935009956, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003427086630836129, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 52391076.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 
0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445080757141, + "sampling/importance_sampling_ratio/min": 2.0617162590497173e-05, + "sampling/sampling_logp_difference/max": 10.789386749267578, + "sampling/sampling_logp_difference/mean": 0.019165968522429466, + "step": 70 + }, + { + "clip_ratio/high_max": 1.4208102129487088e-05, + "clip_ratio/high_mean": 3.552025532371772e-06, + "clip_ratio/low_mean": 3.275496806054434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630699370660295e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16334.0, + "completions/mean_length": 7481.671875, + "completions/mean_terminated_length": 7194.5, + "completions/min_length": 1003.0, + "completions/min_terminated_length": 1003.0, + "entropy": 0.9429318532347679, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002845548093318939, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 53366314.0, + "reward": 0.34375, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999762773513794, + "sampling/importance_sampling_ratio/min": 0.00124227290507406, + "sampling/sampling_logp_difference/max": 6.690812587738037, + "sampling/sampling_logp_difference/mean": 0.019388489425182343, + "step": 71 + }, + { + "clip_ratio/high_max": 2.2517269826494157e-05, + "clip_ratio/high_mean": 5.629317456623539e-06, + "clip_ratio/low_mean": 6.0563696024473757e-05, + "clip_ratio/low_min": 6.892558758408995e-06, + "clip_ratio/region_mean": 6.61930134810973e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16230.0, + 
"completions/mean_length": 6112.03125, + "completions/mean_terminated_length": 5865.50439453125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.9013729467988014, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017839284846559167, + "learning_rate": 1e-05, + "loss": 0.0758, + "num_tokens": 54165910.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.0015448861522600055, + "sampling/sampling_logp_difference/max": 6.472805023193359, + "sampling/sampling_logp_difference/mean": 0.019030068069696426, + "step": 72 + }, + { + "clip_ratio/high_max": 7.458678737748414e-06, + "clip_ratio/high_mean": 1.8646696844371036e-06, + "clip_ratio/low_mean": 2.7964613764197566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.982928344863467e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15853.0, + "completions/max_terminated_length": 15853.0, + "completions/mean_length": 4590.625, + "completions/mean_terminated_length": 4590.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.8759121596813202, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035294899716973305, + "learning_rate": 1e-05, + "loss": 0.0802, + "num_tokens": 54771526.0, + "reward": 0.4375, + "reward_std": 0.41268986463546753, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.0007238102261908352, + "sampling/sampling_logp_difference/max": 7.230981349945068, + 
"sampling/sampling_logp_difference/mean": 0.017765047028660774, + "step": 73 + }, + { + "clip_ratio/high_max": 1.460266958019929e-05, + "clip_ratio/high_mean": 3.6506673950498225e-06, + "clip_ratio/low_mean": 3.319967777315469e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.685034562295186e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + "completions/mean_length": 5152.234375, + "completions/mean_terminated_length": 5063.79541015625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.8593896478414536, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003597866278141737, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 55449820.0, + "reward": 0.4453125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999961853027344, + "sampling/importance_sampling_ratio/min": 0.0005548940971493721, + "sampling/sampling_logp_difference/max": 7.49673318862915, + "sampling/sampling_logp_difference/mean": 0.018061507493257523, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.4012571227794979e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4012571227794979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16195.0, + "completions/mean_length": 6629.2734375, + "completions/mean_terminated_length": 6474.43701171875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.106893703341484, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0014848506543785334, + "learning_rate": 1e-05, + "loss": -0.0128, + 
"num_tokens": 56318135.0, + "reward": 0.2109375, + "reward_std": 0.190433531999588, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266862869263, + "sampling/importance_sampling_ratio/min": 1.3627897033074987e-08, + "sampling/sampling_logp_difference/max": 18.111146926879883, + "sampling/sampling_logp_difference/mean": 0.021642908453941345, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.388627917251142e-05, + "clip_ratio/low_min": 5.944737495156005e-06, + "clip_ratio/region_mean": 4.388627917251142e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14845.0, + "completions/max_terminated_length": 14845.0, + "completions/mean_length": 5802.8828125, + "completions/mean_terminated_length": 5802.8828125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9879340082406998, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003300516167655587, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 57078080.0, + "reward": 0.3125, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000663995742798, + "sampling/importance_sampling_ratio/min": 0.0010333232348784804, + "sampling/sampling_logp_difference/max": 6.874975204467773, + "sampling/sampling_logp_difference/mean": 0.01895206607878208, + "step": 76 + }, + { + "clip_ratio/high_max": 1.071953920472879e-05, + "clip_ratio/high_mean": 2.6798848011821974e-06, + "clip_ratio/low_mean": 4.836337473079766e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.104325930460618e-05, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 14713.0, + "completions/max_terminated_length": 14713.0, + "completions/mean_length": 5293.1640625, + "completions/mean_terminated_length": 5293.1640625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.9724989607930183, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002898244420066476, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 57774093.0, + "reward": 0.4296875, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0031829492654651403, + "sampling/sampling_logp_difference/max": 5.7499470710754395, + "sampling/sampling_logp_difference/mean": 0.019694382324814796, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.102629304725269e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.102629304725269e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13987.0, + "completions/mean_length": 5771.5625, + "completions/mean_terminated_length": 5340.16259765625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.9740649163722992, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002261349931359291, + "learning_rate": 1e-05, + "loss": 0.0738, + "num_tokens": 58531293.0, + "reward": 0.25, + "reward_std": 0.26120057702064514, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999984502792358, + "sampling/importance_sampling_ratio/min": 7.037367322482169e-05, + 
"sampling/sampling_logp_difference/max": 9.561691284179688, + "sampling/sampling_logp_difference/mean": 0.019619958475232124, + "step": 78 + }, + { + "clip_ratio/high_max": 1.241475092683686e-05, + "clip_ratio/high_mean": 3.955232841690304e-06, + "clip_ratio/low_mean": 3.313706986318721e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.709230361437221e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 6832.59375, + "completions/mean_terminated_length": 6524.48388671875, + "completions/min_length": 674.0, + "completions/min_terminated_length": 674.0, + "entropy": 0.8907959461212158, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002895365934818983, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 59425137.0, + "reward": 0.4296875, + "reward_std": 0.36797165870666504, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000522136688232, + "sampling/importance_sampling_ratio/min": 0.000623974425252527, + "sampling/sampling_logp_difference/max": 7.379401206970215, + "sampling/sampling_logp_difference/mean": 0.019336842000484467, + "step": 79 + }, + { + "clip_ratio/high_max": 1.309858976128453e-05, + "clip_ratio/high_mean": 3.2746474403211323e-06, + "clip_ratio/low_mean": 3.091655224807255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.419119957470684e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15537.0, + "completions/mean_length": 5741.3515625, + "completions/mean_terminated_length": 5572.4208984375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.9363748207688332, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 
0.3125, + "grad_norm": 0.003053537104278803, + "learning_rate": 1e-05, + "loss": 0.0503, + "num_tokens": 60177006.0, + "reward": 0.3828125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999263882637024, + "sampling/importance_sampling_ratio/min": 0.0009319739765487611, + "sampling/sampling_logp_difference/max": 6.978205680847168, + "sampling/sampling_logp_difference/mean": 0.01948600634932518, + "step": 80 + }, + { + "clip_ratio/high_max": 2.1969835415802663e-05, + "clip_ratio/high_mean": 7.355770890171698e-06, + "clip_ratio/low_mean": 3.6011779457112425e-05, + "clip_ratio/low_min": 4.118887773074675e-06, + "clip_ratio/region_mean": 4.336755046097096e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 6333.078125, + "completions/mean_terminated_length": 6091.8564453125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.8286701366305351, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001936097047291696, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 61007192.0, + "reward": 0.2890625, + "reward_std": 0.3135277032852173, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999134540557861, + "sampling/importance_sampling_ratio/min": 0.00018122897017747164, + "sampling/sampling_logp_difference/max": 8.61574935913086, + "sampling/sampling_logp_difference/mean": 0.017766552045941353, + "step": 81 + }, + { + "clip_ratio/high_max": 3.815369746007491e-05, + "clip_ratio/high_mean": 1.1110751302112476e-05, + "clip_ratio/low_mean": 5.337692005014105e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.448767180700088e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14444.0, + "completions/mean_length": 4467.71875, + "completions/mean_terminated_length": 4373.8896484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0210246965289116, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00340029364451766, + "learning_rate": 1e-05, + "loss": -0.0143, + "num_tokens": 61606900.0, + "reward": 0.359375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999921441078186, + "sampling/importance_sampling_ratio/min": 0.004546399228274822, + "sampling/sampling_logp_difference/max": 5.3934197425842285, + "sampling/sampling_logp_difference/mean": 0.019704686477780342, + "step": 82 + }, + { + "clip_ratio/high_max": 1.4954135622247122e-05, + "clip_ratio/high_mean": 3.7385339055617806e-06, + "clip_ratio/low_mean": 3.632040886714094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0058942545329046e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15231.0, + "completions/mean_length": 5543.71875, + "completions/mean_terminated_length": 5283.55224609375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.9587382078170776, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016365943010896444, + "learning_rate": 1e-05, + "loss": 0.0057, + "num_tokens": 62335440.0, + "reward": 0.2421875, + "reward_std": 0.2964382767677307, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062346458435, + "sampling/importance_sampling_ratio/min": 1.835696679108878e-07, + "sampling/sampling_logp_difference/max": 15.510671615600586, + "sampling/sampling_logp_difference/mean": 0.019060850143432617, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1255708386670449e-05, + "clip_ratio/high_mean": 2.813927096667612e-06, + "clip_ratio/low_mean": 1.205687783567555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4870804704969487e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15514.0, + "completions/max_terminated_length": 15514.0, + "completions/mean_length": 5553.65625, + "completions/mean_terminated_length": 5553.65625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 1.0059658586978912, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028732717037200928, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 63071644.0, + "reward": 0.3046875, + "reward_std": 0.3098035454750061, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000003457069397, + "sampling/importance_sampling_ratio/min": 0.0030927264597266912, + "sampling/sampling_logp_difference/max": 5.778702259063721, + "sampling/sampling_logp_difference/mean": 0.01885710284113884, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.669913806130353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.669913806130353e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 5576.2265625, + "completions/mean_terminated_length": 5491.1259765625, + "completions/min_length": 62.0, + 
"completions/min_terminated_length": 62.0, + "entropy": 0.9912052825093269, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003957705572247505, + "learning_rate": 1e-05, + "loss": 0.0033, + "num_tokens": 63804529.0, + "reward": 0.2265625, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998383522033691, + "sampling/importance_sampling_ratio/min": 0.0004883196670562029, + "sampling/sampling_logp_difference/max": 7.624540328979492, + "sampling/sampling_logp_difference/mean": 0.019657567143440247, + "step": 85 + }, + { + "clip_ratio/high_max": 7.340359388763318e-06, + "clip_ratio/high_mean": 1.8350898471908295e-06, + "clip_ratio/low_mean": 4.2495241643791815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4330331377295806e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6716.9375, + "completions/mean_terminated_length": 6484.92822265625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.974421925842762, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027452034410089254, + "learning_rate": 1e-05, + "loss": -0.0238, + "num_tokens": 64684825.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998891949653625, + "sampling/importance_sampling_ratio/min": 0.00023439532378688455, + "sampling/sampling_logp_difference/max": 8.358501434326172, + "sampling/sampling_logp_difference/mean": 0.020278966054320335, + "step": 86 + }, + { + "clip_ratio/high_max": 
1.1668500064843101e-05, + "clip_ratio/high_mean": 2.9171250162107754e-06, + "clip_ratio/low_mean": 2.278766351082595e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5704788185976213e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 6033.609375, + "completions/mean_terminated_length": 5869.31787109375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.9376208484172821, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014822481898590922, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 65476055.0, + "reward": 0.28125, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359846115112, + "sampling/importance_sampling_ratio/min": 0.0031867078505456448, + "sampling/sampling_logp_difference/max": 5.748766899108887, + "sampling/sampling_logp_difference/mean": 0.0203948225826025, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2838053666873748e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2838053666873748e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15593.0, + "completions/mean_length": 6561.4453125, + "completions/mean_terminated_length": 6405.5322265625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8753902241587639, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016284709563478827, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 66335528.0, + "reward": 0.3125, + "reward_std": 0.28535234928131104, + "rewards/accuracy_reward/mean": 
0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 7.897153409430757e-06, + "sampling/sampling_logp_difference/max": 11.749008178710938, + "sampling/sampling_logp_difference/mean": 0.01995038241147995, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7495306085256743e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7495306085256743e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12974.0, + "completions/mean_length": 5322.03125, + "completions/mean_terminated_length": 5234.92919921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 0.9731436967849731, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004579639527946711, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 67036244.0, + "reward": 0.3828125, + "reward_std": 0.2714630365371704, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000991821289062, + "sampling/importance_sampling_ratio/min": 0.00016946837422437966, + "sampling/sampling_logp_difference/max": 8.682844161987305, + "sampling/sampling_logp_difference/mean": 0.017986822873353958, + "step": 89 + }, + { + "clip_ratio/high_max": 9.390067589265527e-06, + "clip_ratio/high_mean": 2.347516897316382e-06, + "clip_ratio/low_mean": 2.9141255822651146e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.148877271996753e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 5428.1484375, + 
"completions/mean_terminated_length": 5254.24609375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.9560057744383812, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030614053830504417, + "learning_rate": 1e-05, + "loss": 0.0677, + "num_tokens": 67751911.0, + "reward": 0.40625, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998039603233337, + "sampling/importance_sampling_ratio/min": 0.00041119891102425754, + "sampling/sampling_logp_difference/max": 7.796433448791504, + "sampling/sampling_logp_difference/mean": 0.019884781911969185, + "step": 90 + }, + { + "clip_ratio/high_max": 1.3370414308155887e-05, + "clip_ratio/high_mean": 3.3426035770389717e-06, + "clip_ratio/low_mean": 2.5133818439826427e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.84764220168654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16098.0, + "completions/mean_length": 6381.9140625, + "completions/mean_terminated_length": 6303.1572265625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "entropy": 1.0577945485711098, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018679362256079912, + "learning_rate": 1e-05, + "loss": 0.0464, + "num_tokens": 68594620.0, + "reward": 0.1875, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 7.031799759715796e-05, + "sampling/sampling_logp_difference/max": 9.562482833862305, + 
"sampling/sampling_logp_difference/mean": 0.019965168088674545, + "step": 91 + }, + { + "clip_ratio/high_max": 5.103707280795788e-06, + "clip_ratio/high_mean": 1.275926820198947e-06, + "clip_ratio/low_mean": 4.938993617997767e-05, + "clip_ratio/low_min": 4.324361725593917e-06, + "clip_ratio/region_mean": 5.06658626591161e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14491.0, + "completions/mean_length": 5626.5703125, + "completions/mean_terminated_length": 5455.81787109375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.8880954682826996, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003913378342986107, + "learning_rate": 1e-05, + "loss": 0.078, + "num_tokens": 69335061.0, + "reward": 0.359375, + "reward_std": 0.4066115617752075, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001311302185, + "sampling/importance_sampling_ratio/min": 0.00010254964581690729, + "sampling/sampling_logp_difference/max": 9.185163497924805, + "sampling/sampling_logp_difference/mean": 0.018766846507787704, + "step": 92 + }, + { + "clip_ratio/high_max": 2.656613628460036e-05, + "clip_ratio/high_mean": 6.64153407115009e-06, + "clip_ratio/low_mean": 5.355309394872165e-05, + "clip_ratio/low_min": 6.923673481651349e-06, + "clip_ratio/region_mean": 6.019462853146251e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6252.5078125, + "completions/mean_terminated_length": 6172.732421875, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "entropy": 1.0409839749336243, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.002942018210887909, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 70158806.0, + "reward": 0.3515625, + "reward_std": 0.30221226811408997, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998798370361328, + "sampling/importance_sampling_ratio/min": 0.00027446431340649724, + "sampling/sampling_logp_difference/max": 8.200689315795898, + "sampling/sampling_logp_difference/mean": 0.02092035487294197, + "step": 93 + }, + { + "clip_ratio/high_max": 1.0007204764406197e-05, + "clip_ratio/high_mean": 2.501801191101549e-06, + "clip_ratio/low_mean": 6.03029346848416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.280473587594315e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15687.0, + "completions/mean_length": 5936.171875, + "completions/mean_terminated_length": 5770.33349609375, + "completions/min_length": 614.0, + "completions/min_terminated_length": 614.0, + "entropy": 0.9782606735825539, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018363922135904431, + "learning_rate": 1e-05, + "loss": 0.0037, + "num_tokens": 70938108.0, + "reward": 0.296875, + "reward_std": 0.31824085116386414, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999080300331116, + "sampling/importance_sampling_ratio/min": 0.0001234232186106965, + "sampling/sampling_logp_difference/max": 8.99989128112793, + "sampling/sampling_logp_difference/mean": 0.02028634399175644, + "step": 94 + }, + { + "clip_ratio/high_max": 2.2271185798672377e-05, + "clip_ratio/high_mean": 5.567796449668094e-06, + "clip_ratio/low_mean": 2.026856623160711e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.583636239705811e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15826.0, + "completions/mean_length": 5796.34375, + "completions/mean_terminated_length": 5712.9765625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.9343783929944038, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036520177964121103, + "learning_rate": 1e-05, + "loss": 0.0465, + "num_tokens": 71697904.0, + "reward": 0.4296875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000145435333252, + "sampling/importance_sampling_ratio/min": 0.0013267943868413568, + "sampling/sampling_logp_difference/max": 6.6249895095825195, + "sampling/sampling_logp_difference/mean": 0.01939292624592781, + "step": 95 + }, + { + "clip_ratio/high_max": 1.3236602853794466e-05, + "clip_ratio/high_mean": 5.30995015424196e-06, + "clip_ratio/low_mean": 2.4116298618537257e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.942624860224896e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16075.0, + "completions/mean_length": 5912.5078125, + "completions/mean_terminated_length": 5746.2939453125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8880549967288971, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002131880959495902, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 72472657.0, + "reward": 0.484375, + "reward_std": 0.3027363121509552, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 1.3350321736993465e-08, + "sampling/sampling_logp_difference/max": 18.131725311279297, + "sampling/sampling_logp_difference/mean": 0.019045043736696243, + "step": 96 + }, + { + "clip_ratio/high_max": 1.0632415978761856e-05, + "clip_ratio/high_mean": 2.658103994690464e-06, + "clip_ratio/low_mean": 3.596552733142744e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862363143980474e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14486.0, + "completions/mean_length": 5471.203125, + "completions/mean_terminated_length": 5385.275390625, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.9127756953239441, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.0030769745353609324, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 73191403.0, + "reward": 0.5234375, + "reward_std": 0.4281895160675049, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999668598175049, + "sampling/importance_sampling_ratio/min": 1.3584097757757263e-07, + "sampling/sampling_logp_difference/max": 15.81178092956543, + "sampling/sampling_logp_difference/mean": 0.019179491326212883, + "step": 97 + }, + { + "clip_ratio/high_max": 6.134668183221947e-06, + "clip_ratio/high_mean": 1.5336670458054869e-06, + "clip_ratio/low_mean": 2.465653636818388e-05, + "clip_ratio/low_min": 3.4443801268935204e-06, + "clip_ratio/region_mean": 2.6190203413989366e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6366.5078125, + "completions/mean_terminated_length": 6207.50048828125, + 
"completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.9889310300350189, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027727377600967884, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 74026484.0, + "reward": 0.328125, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998502731323242, + "sampling/importance_sampling_ratio/min": 0.00011932474444620311, + "sampling/sampling_logp_difference/max": 9.033661842346191, + "sampling/sampling_logp_difference/mean": 0.01946873590350151, + "step": 98 + }, + { + "clip_ratio/high_max": 1.3569412203651154e-05, + "clip_ratio/high_mean": 3.3923530509127886e-06, + "clip_ratio/low_mean": 2.118610348134098e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4578456645940605e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16312.0, + "completions/max_terminated_length": 16312.0, + "completions/mean_length": 4089.6015625, + "completions/mean_terminated_length": 4089.6015625, + "completions/min_length": 566.0, + "completions/min_terminated_length": 566.0, + "entropy": 0.8083604946732521, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003628374310210347, + "learning_rate": 1e-05, + "loss": -0.002, + "num_tokens": 74567833.0, + "reward": 0.484375, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944269657135, + "sampling/importance_sampling_ratio/min": 0.000612107920460403, + "sampling/sampling_logp_difference/max": 7.39860200881958, + "sampling/sampling_logp_difference/mean": 0.017995744943618774, + "step": 99 + }, + { + 
"clip_ratio/high_max": 1.947620376085979e-05, + "clip_ratio/high_mean": 5.989323312860506e-06, + "clip_ratio/low_mean": 2.8597964728760417e-05, + "clip_ratio/low_min": 7.570710295112804e-06, + "clip_ratio/region_mean": 3.458728804162092e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16340.0, + "completions/mean_length": 5678.7890625, + "completions/mean_terminated_length": 5508.865234375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.880424402654171, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004177773837000132, + "learning_rate": 1e-05, + "loss": 0.0595, + "num_tokens": 75314022.0, + "reward": 0.4765625, + "reward_std": 0.4105730950832367, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999686479568481, + "sampling/importance_sampling_ratio/min": 3.343528805999085e-05, + "sampling/sampling_logp_difference/max": 10.305898666381836, + "sampling/sampling_logp_difference/mean": 0.018467536196112633, + "step": 100 + }, + { + "clip_ratio/high_max": 1.4969179119361797e-05, + "clip_ratio/high_mean": 3.7422947798404493e-06, + "clip_ratio/low_mean": 5.1001184147025924e-05, + "clip_ratio/low_min": 7.801042556820903e-06, + "clip_ratio/region_mean": 5.474347858580586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5253.0234375, + "completions/mean_terminated_length": 5253.0234375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.9227524027228355, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015437579713761806, + "learning_rate": 1e-05, + "loss": 0.0445, + "num_tokens": 76005417.0, + 
"reward": 0.3515625, + "reward_std": 0.34586966037750244, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999125003814697, + "sampling/importance_sampling_ratio/min": 5.159151623956859e-05, + "sampling/sampling_logp_difference/max": 9.872153282165527, + "sampling/sampling_logp_difference/mean": 0.018250152468681335, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3062932339380495e-05, + "clip_ratio/high_mean": 3.265733084845124e-06, + "clip_ratio/low_mean": 3.931676133106521e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2582495325405034e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15289.0, + "completions/mean_length": 5956.921875, + "completions/mean_terminated_length": 5533.056640625, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.892315685749054, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019212538609281182, + "learning_rate": 1e-05, + "loss": 0.0688, + "num_tokens": 76787623.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054074287415, + "sampling/importance_sampling_ratio/min": 0.0012463966850191355, + "sampling/sampling_logp_difference/max": 6.687498569488525, + "sampling/sampling_logp_difference/mean": 0.018439805135130882, + "step": 102 + }, + { + "clip_ratio/high_max": 2.714365291467402e-05, + "clip_ratio/high_mean": 6.785913228668505e-06, + "clip_ratio/low_mean": 3.920890912922914e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5994822471584484e-05, + "completions/clipped_ratio": 0.0234375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14715.0, + "completions/mean_length": 5575.09375, + "completions/mean_terminated_length": 5315.68017578125, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 1.0225786119699478, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029739944729954004, + "learning_rate": 1e-05, + "loss": 0.0482, + "num_tokens": 77520091.0, + "reward": 0.3203125, + "reward_std": 0.29719969630241394, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 1.9004226032848237e-06, + "sampling/sampling_logp_difference/max": 13.173434257507324, + "sampling/sampling_logp_difference/mean": 0.020432481542229652, + "step": 103 + }, + { + "clip_ratio/high_max": 1.1180974752278416e-05, + "clip_ratio/high_mean": 2.795243688069604e-06, + "clip_ratio/low_mean": 5.534062506740156e-05, + "clip_ratio/low_min": 4.409326720633544e-06, + "clip_ratio/region_mean": 5.813586813019356e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16346.0, + "completions/mean_length": 7777.171875, + "completions/mean_terminated_length": 7499.5322265625, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "entropy": 0.8798429742455482, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021529686637222767, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 78538993.0, + "reward": 0.3203125, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998733401298523, + 
"sampling/importance_sampling_ratio/min": 2.081840648315847e-06, + "sampling/sampling_logp_difference/max": 13.082258224487305, + "sampling/sampling_logp_difference/mean": 0.019486568868160248, + "step": 104 + }, + { + "clip_ratio/high_max": 1.4091711364017101e-05, + "clip_ratio/high_mean": 3.5229278410042753e-06, + "clip_ratio/low_mean": 4.0216968045569956e-05, + "clip_ratio/low_min": 4.320475454733241e-06, + "clip_ratio/region_mean": 4.3739896682382096e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15763.0, + "completions/mean_length": 6298.4296875, + "completions/mean_terminated_length": 6219.015625, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0422330349683762, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002485725563019514, + "learning_rate": 1e-05, + "loss": 0.0674, + "num_tokens": 79365144.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999951124191284, + "sampling/importance_sampling_ratio/min": 0.0008047395385801792, + "sampling/sampling_logp_difference/max": 7.1249918937683105, + "sampling/sampling_logp_difference/mean": 0.021251153200864792, + "step": 105 + }, + { + "clip_ratio/high_max": 5.182851054996718e-06, + "clip_ratio/high_mean": 1.2957127637491794e-06, + "clip_ratio/low_mean": 1.3408006566351105e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4703719102726609e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13199.0, + "completions/max_terminated_length": 13199.0, + "completions/mean_length": 5001.8515625, + "completions/mean_terminated_length": 5001.8515625, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 
0.9210668653249741, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018336179200559855, + "learning_rate": 1e-05, + "loss": -0.0075, + "num_tokens": 80024661.0, + "reward": 0.3984375, + "reward_std": 0.2969672679901123, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0015512153040617704, + "sampling/sampling_logp_difference/max": 6.468716621398926, + "sampling/sampling_logp_difference/mean": 0.018811997026205063, + "step": 106 + }, + { + "clip_ratio/high_max": 3.179798750352347e-05, + "clip_ratio/high_mean": 7.949496875880868e-06, + "clip_ratio/low_mean": 2.5010467197716935e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.29599640735978e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 6280.1875, + "completions/mean_terminated_length": 6119.81005859375, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 1.0198880061507225, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00276190135627985, + "learning_rate": 1e-05, + "loss": 0.0474, + "num_tokens": 80845941.0, + "reward": 0.2578125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.00043450010707601905, + "sampling/sampling_logp_difference/max": 7.74131441116333, + "sampling/sampling_logp_difference/mean": 0.020783018320798874, + "step": 107 + }, + { + "clip_ratio/high_max": 1.0263617241434986e-05, + "clip_ratio/high_mean": 
2.5659043103587464e-06, + "clip_ratio/low_mean": 2.2780154608881276e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.534605857817951e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14812.0, + "completions/mean_length": 5617.109375, + "completions/mean_terminated_length": 5358.7041015625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0532233864068985, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020079545211046934, + "learning_rate": 1e-05, + "loss": 0.03, + "num_tokens": 81584099.0, + "reward": 0.3515625, + "reward_std": 0.3037971258163452, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000622272491455, + "sampling/importance_sampling_ratio/min": 0.0014304202049970627, + "sampling/sampling_logp_difference/max": 6.5497870445251465, + "sampling/sampling_logp_difference/mean": 0.019330721348524094, + "step": 108 + }, + { + "clip_ratio/high_max": 3.592160510379472e-06, + "clip_ratio/high_mean": 8.98040127594868e-07, + "clip_ratio/low_mean": 2.2189478841028176e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3087518968623044e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15901.0, + "completions/mean_length": 4336.828125, + "completions/mean_terminated_length": 4241.96826171875, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.8131270706653595, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002346212510019541, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 82157581.0, + "reward": 0.59375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.59375, 
+ "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998981952667236, + "sampling/importance_sampling_ratio/min": 0.011126067489385605, + "sampling/sampling_logp_difference/max": 4.498464584350586, + "sampling/sampling_logp_difference/mean": 0.01748315989971161, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.621310563379666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.621310563379666e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15911.0, + "completions/mean_length": 6185.1640625, + "completions/mean_terminated_length": 6023.2783203125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.9515878483653069, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020737929735332727, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 82970866.0, + "reward": 0.296875, + "reward_std": 0.2580229640007019, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999544024467468, + "sampling/importance_sampling_ratio/min": 0.00021864472364541143, + "sampling/sampling_logp_difference/max": 8.428062438964844, + "sampling/sampling_logp_difference/mean": 0.019794369116425514, + "step": 110 + }, + { + "clip_ratio/high_max": 2.830697485478595e-05, + "clip_ratio/high_mean": 7.076743713696487e-06, + "clip_ratio/low_mean": 3.404362587389187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1120369132841006e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15649.0, + "completions/mean_length": 6042.359375, + 
"completions/mean_terminated_length": 5960.92919921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9405315592885017, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013609385350719094, + "learning_rate": 1e-05, + "loss": 0.0023, + "num_tokens": 83762664.0, + "reward": 0.265625, + "reward_std": 0.2937847375869751, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000874996185303, + "sampling/importance_sampling_ratio/min": 0.03007127158343792, + "sampling/sampling_logp_difference/max": 3.5041849613189697, + "sampling/sampling_logp_difference/mean": 0.02063683047890663, + "step": 111 + }, + { + "clip_ratio/high_max": 2.4490228042850504e-05, + "clip_ratio/high_mean": 7.702277343923924e-06, + "clip_ratio/low_mean": 4.2714329822501895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.04166071095824e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16288.0, + "completions/mean_length": 7036.859375, + "completions/mean_terminated_length": 6963.259765625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9034569710493088, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017795560415834188, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 84684566.0, + "reward": 0.359375, + "reward_std": 0.2977414131164551, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000296831130981, + "sampling/importance_sampling_ratio/min": 0.03753140941262245, + "sampling/sampling_logp_difference/max": 3.2825770378112793, + 
"sampling/sampling_logp_difference/mean": 0.019494226202368736, + "step": 112 + }, + { + "clip_ratio/high_max": 2.028518520091893e-05, + "clip_ratio/high_mean": 6.102377255956526e-06, + "clip_ratio/low_mean": 3.518054700180073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.128292380300991e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16308.0, + "completions/mean_length": 6958.6484375, + "completions/mean_terminated_length": 6413.3798828125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.9195531085133553, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027138369623571634, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 85598345.0, + "reward": 0.421875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999241828918457, + "sampling/importance_sampling_ratio/min": 0.0004585298302117735, + "sampling/sampling_logp_difference/max": 7.687485218048096, + "sampling/sampling_logp_difference/mean": 0.0201261006295681, + "step": 113 + }, + { + "clip_ratio/high_max": 7.460459528374486e-06, + "clip_ratio/high_mean": 3.464071141934255e-06, + "clip_ratio/low_mean": 3.825124849754502e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.171532009422663e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5773.890625, + "completions/mean_terminated_length": 5773.890625, + "completions/min_length": 792.0, + "completions/min_terminated_length": 792.0, + "entropy": 0.8253094777464867, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019655083306133747, + "learning_rate": 1e-05, + 
"loss": 0.0056, + "num_tokens": 86356403.0, + "reward": 0.390625, + "reward_std": 0.2635546922683716, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 2.981063744300627e-06, + "sampling/sampling_logp_difference/max": 12.723230361938477, + "sampling/sampling_logp_difference/mean": 0.018150178715586662, + "step": 114 + }, + { + "clip_ratio/high_max": 7.937012014735956e-06, + "clip_ratio/high_mean": 1.984253003683989e-06, + "clip_ratio/low_mean": 4.778610400535399e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9770356781664304e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15584.0, + "completions/mean_length": 5233.546875, + "completions/mean_terminated_length": 4873.8544921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8463557213544846, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0024442693684250116, + "learning_rate": 1e-05, + "loss": 0.1172, + "num_tokens": 87043681.0, + "reward": 0.375, + "reward_std": 0.3987257480621338, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265670776367, + "sampling/importance_sampling_ratio/min": 4.3303893448864983e-07, + "sampling/sampling_logp_difference/max": 14.652438163757324, + "sampling/sampling_logp_difference/mean": 0.01760055497288704, + "step": 115 + }, + { + "clip_ratio/high_max": 2.0049358681717422e-05, + "clip_ratio/high_mean": 6.392639988916926e-06, + "clip_ratio/low_mean": 2.7909350819754764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4301990581298014e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16070.0, + "completions/mean_length": 6098.5234375, + "completions/mean_terminated_length": 5851.67236328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.9961429908871651, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001763843116350472, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 87845012.0, + "reward": 0.3125, + "reward_std": 0.24329747259616852, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 0.0012967984657734632, + "sampling/sampling_logp_difference/max": 6.647856712341309, + "sampling/sampling_logp_difference/mean": 0.020430129021406174, + "step": 116 + }, + { + "clip_ratio/high_max": 6.041565939085558e-06, + "clip_ratio/high_mean": 1.5103914847713895e-06, + "clip_ratio/low_mean": 3.8537290720341844e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004768220511323e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15364.0, + "completions/mean_length": 7306.828125, + "completions/mean_terminated_length": 6937.8369140625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 1.0500907376408577, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023989977780729532, + "learning_rate": 1e-05, + "loss": 0.0383, + "num_tokens": 88799758.0, + "reward": 0.1875, + "reward_std": 0.23752352595329285, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998784065246582, + "sampling/importance_sampling_ratio/min": 0.00016530237917322665, + "sampling/sampling_logp_difference/max": 8.707734107971191, + "sampling/sampling_logp_difference/mean": 0.021274670958518982, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1037226335683954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1037226335683954e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15693.0, + "completions/mean_length": 5156.9765625, + "completions/mean_terminated_length": 4978.77001953125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 1.0691863298416138, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032527034636586905, + "learning_rate": 1e-05, + "loss": 0.1168, + "num_tokens": 89482459.0, + "reward": 0.4140625, + "reward_std": 0.3406246304512024, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999943375587463, + "sampling/importance_sampling_ratio/min": 0.00010107864363817498, + "sampling/sampling_logp_difference/max": 9.19961166381836, + "sampling/sampling_logp_difference/mean": 0.019853606820106506, + "step": 118 + }, + { + "clip_ratio/high_max": 2.2721950699633453e-05, + "clip_ratio/high_mean": 5.680487674908363e-06, + "clip_ratio/low_mean": 4.0971160615299596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6651648517581634e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6804.8125, + "completions/mean_terminated_length": 6495.80615234375, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.867309644818306, 
+ "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019014904974028468, + "learning_rate": 1e-05, + "loss": 0.0593, + "num_tokens": 90372587.0, + "reward": 0.375, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 0.00012341687397565693, + "sampling/sampling_logp_difference/max": 8.999942779541016, + "sampling/sampling_logp_difference/mean": 0.018908457830548286, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0602929251035675e-05, + "clip_ratio/high_mean": 2.650732312758919e-06, + "clip_ratio/low_mean": 4.483750217332272e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.748823448608164e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15419.0, + "completions/max_terminated_length": 15419.0, + "completions/mean_length": 5354.2890625, + "completions/mean_terminated_length": 5354.2890625, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9092740416526794, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028308529872447252, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 91080912.0, + "reward": 0.3359375, + "reward_std": 0.34245961904525757, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000000238418579, + "sampling/importance_sampling_ratio/min": 0.003619713708758354, + "sampling/sampling_logp_difference/max": 5.6213603019714355, + "sampling/sampling_logp_difference/mean": 0.018408317118883133, + "step": 120 + }, + { + "clip_ratio/high_max": 7.076040446918341e-06, + "clip_ratio/high_mean": 1.7690101117295853e-06, + 
"clip_ratio/low_mean": 6.420628960768227e-05, + "clip_ratio/low_min": 9.37260915634397e-06, + "clip_ratio/region_mean": 6.59752995488816e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7653.1328125, + "completions/mean_terminated_length": 7371.49169921875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.9067098647356033, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026082738768309355, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 92080441.0, + "reward": 0.3125, + "reward_std": 0.3395638167858124, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999957084655762, + "sampling/importance_sampling_ratio/min": 3.7638976209564134e-05, + "sampling/sampling_logp_difference/max": 10.187470436096191, + "sampling/sampling_logp_difference/mean": 0.019849080592393875, + "step": 121 + }, + { + "clip_ratio/high_max": 4.642525709641632e-06, + "clip_ratio/high_mean": 1.8333832940697903e-06, + "clip_ratio/low_mean": 4.188668265214801e-05, + "clip_ratio/low_min": 6.032381861587055e-06, + "clip_ratio/region_mean": 4.3720065264096775e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 7864.796875, + "completions/mean_terminated_length": 7220.48779296875, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.0423363894224167, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001708728028461337, + "learning_rate": 1e-05, + "loss": 0.0394, + "num_tokens": 93107607.0, + "reward": 0.2265625, + "reward_std": 0.23933593928813934, + "rewards/accuracy_reward/mean": 
0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999992311000824, + "sampling/importance_sampling_ratio/min": 4.743846602650592e-06, + "sampling/sampling_logp_difference/max": 12.258662223815918, + "sampling/sampling_logp_difference/mean": 0.02070365846157074, + "step": 122 + }, + { + "clip_ratio/high_max": 6.424297680496238e-06, + "clip_ratio/high_mean": 1.6060744201240595e-06, + "clip_ratio/low_mean": 4.487338674152852e-05, + "clip_ratio/low_min": 7.803849257470574e-06, + "clip_ratio/region_mean": 4.647946116165258e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7690.6328125, + "completions/mean_terminated_length": 7622.18115234375, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 1.061365969479084, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026824623346328735, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 94111296.0, + "reward": 0.2890625, + "reward_std": 0.2556639611721039, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998635649681091, + "sampling/importance_sampling_ratio/min": 0.00014029098383616656, + "sampling/sampling_logp_difference/max": 8.87179183959961, + "sampling/sampling_logp_difference/mean": 0.021192047744989395, + "step": 123 + }, + { + "clip_ratio/high_max": 5.478851562656928e-06, + "clip_ratio/high_mean": 1.369712890664232e-06, + "clip_ratio/low_mean": 1.5870192100919667e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.72399049915839e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15693.0, + 
"completions/mean_length": 5871.2265625, + "completions/mean_terminated_length": 5618.92041015625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 1.0346312001347542, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0012895551044493914, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 94883061.0, + "reward": 0.3125, + "reward_std": 0.16675156354904175, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999569654464722, + "sampling/importance_sampling_ratio/min": 0.007269685622304678, + "sampling/sampling_logp_difference/max": 4.924042224884033, + "sampling/sampling_logp_difference/mean": 0.02043779566884041, + "step": 124 + }, + { + "clip_ratio/high_max": 9.75199873209931e-06, + "clip_ratio/high_mean": 3.4236486499139573e-06, + "clip_ratio/low_mean": 3.807359871643712e-05, + "clip_ratio/low_min": 6.6283109845244326e-06, + "clip_ratio/region_mean": 4.1497247366351075e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15671.0, + "completions/mean_length": 7205.0703125, + "completions/mean_terminated_length": 6908.9755859375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.8426484614610672, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024157650768756866, + "learning_rate": 1e-05, + "loss": 0.0334, + "num_tokens": 95831798.0, + "reward": 0.3671875, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579191207886, + "sampling/importance_sampling_ratio/min": 0.00780851487070322, + 
"sampling/sampling_logp_difference/max": 4.852540493011475, + "sampling/sampling_logp_difference/mean": 0.01930900476872921, + "step": 125 + }, + { + "clip_ratio/high_max": 7.827117542547057e-06, + "clip_ratio/high_mean": 1.9567793856367643e-06, + "clip_ratio/low_mean": 2.85506193904439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0507398662393825e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15557.0, + "completions/mean_length": 6770.2578125, + "completions/mean_terminated_length": 6539.5283203125, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 0.8648517951369286, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018663652008399367, + "learning_rate": 1e-05, + "loss": 0.0353, + "num_tokens": 96716079.0, + "reward": 0.3671875, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147057533264, + "sampling/importance_sampling_ratio/min": 0.0013688995968550444, + "sampling/sampling_logp_difference/max": 6.593748092651367, + "sampling/sampling_logp_difference/mean": 0.019091933965682983, + "step": 126 + }, + { + "clip_ratio/high_max": 8.396982593694702e-06, + "clip_ratio/high_mean": 2.0992456484236754e-06, + "clip_ratio/low_mean": 3.30035152273922e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5102760875815875e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16138.0, + "completions/mean_length": 7880.8359375, + "completions/mean_terminated_length": 7745.86572265625, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "entropy": 0.9396157637238503, + "epoch": 0.11683532658693652, + 
"frac_reward_zero_std": 0.5, + "grad_norm": 0.0016418134327977896, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 97744506.0, + "reward": 0.2109375, + "reward_std": 0.22225633263587952, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507069587708, + "sampling/importance_sampling_ratio/min": 0.0072977589443326, + "sampling/sampling_logp_difference/max": 4.920187950134277, + "sampling/sampling_logp_difference/mean": 0.02041018195450306, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.872459816671835e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.872459816671835e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 6425.3515625, + "completions/mean_terminated_length": 6267.2783203125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.9397681280970573, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002677743323147297, + "learning_rate": 1e-05, + "loss": 0.0076, + "num_tokens": 98587647.0, + "reward": 0.359375, + "reward_std": 0.2567248046398163, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 5.40250198355352e-07, + "sampling/sampling_logp_difference/max": 14.431233406066895, + "sampling/sampling_logp_difference/mean": 0.020279735326766968, + "step": 128 + }, + { + "clip_ratio/high_max": 1.306506624132453e-05, + "clip_ratio/high_mean": 3.2662665603311325e-06, + "clip_ratio/low_mean": 3.8350387626451266e-05, + "clip_ratio/low_min": 
9.45358260651119e-06, + "clip_ratio/region_mean": 4.161665401625214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 7129.4609375, + "completions/mean_terminated_length": 6907.3525390625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 1.1336064785718918, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032464349642395973, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 99522458.0, + "reward": 0.3046875, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999245405197144, + "sampling/importance_sampling_ratio/min": 0.0046671414747834206, + "sampling/sampling_logp_difference/max": 5.367208480834961, + "sampling/sampling_logp_difference/mean": 0.021748989820480347, + "step": 129 + }, + { + "clip_ratio/high_max": 9.463296464673476e-06, + "clip_ratio/high_mean": 2.365824116168369e-06, + "clip_ratio/low_mean": 3.497452934198009e-05, + "clip_ratio/low_min": 6.806807050452335e-06, + "clip_ratio/region_mean": 3.734035340130504e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 7264.7421875, + "completions/mean_terminated_length": 7119.99267578125, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.8998278677463531, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026182979345321655, + "learning_rate": 1e-05, + "loss": 0.1161, + "num_tokens": 100474137.0, + "reward": 0.46875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000280141830444, + "sampling/importance_sampling_ratio/min": 0.021124430000782013, + "sampling/sampling_logp_difference/max": 3.8573250770568848, + "sampling/sampling_logp_difference/mean": 0.019057951867580414, + "step": 130 + }, + { + "clip_ratio/high_max": 8.944165074353805e-06, + "clip_ratio/high_mean": 2.236041268588451e-06, + "clip_ratio/low_mean": 4.6521246076736134e-05, + "clip_ratio/low_min": 7.112780167517485e-06, + "clip_ratio/region_mean": 4.875728745901142e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15064.0, + "completions/mean_length": 5473.71875, + "completions/mean_terminated_length": 5387.81103515625, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.9666230976581573, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0020499166566878557, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 101191861.0, + "reward": 0.328125, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999291896820068, + "sampling/importance_sampling_ratio/min": 1.8367816210229648e-06, + "sampling/sampling_logp_difference/max": 13.20749568939209, + "sampling/sampling_logp_difference/mean": 0.019896289333701134, + "step": 131 + }, + { + "clip_ratio/high_max": 2.054391302408476e-05, + "clip_ratio/high_mean": 5.13597825602119e-06, + "clip_ratio/low_mean": 6.0949954104216886e-05, + "clip_ratio/low_min": 1.2865434428022127e-05, + "clip_ratio/region_mean": 6.608593298551568e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 6679.9765625, + 
"completions/mean_terminated_length": 5946.05908203125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.8775574564933777, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0024929519277065992, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 102070058.0, + "reward": 0.3671875, + "reward_std": 0.41398313641548157, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.004311627708375454, + "sampling/sampling_logp_difference/max": 5.446439743041992, + "sampling/sampling_logp_difference/mean": 0.018816513940691948, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.7019791250259004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7019791250259004e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16345.0, + "completions/mean_length": 6549.0625, + "completions/mean_terminated_length": 6313.0244140625, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 0.8732621371746063, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002134882379323244, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 102926522.0, + "reward": 0.3828125, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000419616699219, + "sampling/importance_sampling_ratio/min": 0.0017044072737917304, + "sampling/sampling_logp_difference/max": 6.374537944793701, + "sampling/sampling_logp_difference/mean": 
0.019951295107603073, + "step": 133 + }, + { + "clip_ratio/high_max": 3.6268677376938285e-06, + "clip_ratio/high_mean": 9.067169344234571e-07, + "clip_ratio/low_mean": 3.5008752547582844e-05, + "clip_ratio/low_min": 3.866736733471043e-06, + "clip_ratio/region_mean": 3.591546965253656e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16306.0, + "completions/mean_length": 6011.8359375, + "completions/mean_terminated_length": 5677.25, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9975898712873459, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037468743976205587, + "learning_rate": 1e-05, + "loss": 0.0818, + "num_tokens": 103714277.0, + "reward": 0.359375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000693798065186, + "sampling/importance_sampling_ratio/min": 0.002192396903410554, + "sampling/sampling_logp_difference/max": 6.122759819030762, + "sampling/sampling_logp_difference/mean": 0.019433926790952682, + "step": 134 + }, + { + "clip_ratio/high_max": 2.6430232992424862e-05, + "clip_ratio/high_mean": 6.607558248106216e-06, + "clip_ratio/low_mean": 3.3786116432565905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0393675021732633e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15027.0, + "completions/mean_length": 6270.203125, + "completions/mean_terminated_length": 6190.56689453125, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.7808161675930023, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035926424898207188, + "learning_rate": 1e-05, + "loss": 
0.1162, + "num_tokens": 104537295.0, + "reward": 0.4921875, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999791383743286, + "sampling/importance_sampling_ratio/min": 0.00840076245367527, + "sampling/sampling_logp_difference/max": 4.779432773590088, + "sampling/sampling_logp_difference/mean": 0.017456334084272385, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.908255777991144e-05, + "clip_ratio/low_min": 7.643389835720882e-06, + "clip_ratio/region_mean": 4.908255777991144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 4916.25, + "completions/mean_terminated_length": 4734.22265625, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8354851230978966, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004259355366230011, + "learning_rate": 1e-05, + "loss": 0.0879, + "num_tokens": 105184551.0, + "reward": 0.4609375, + "reward_std": 0.3656175136566162, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000231266021729, + "sampling/importance_sampling_ratio/min": 0.003178094746544957, + "sampling/sampling_logp_difference/max": 5.751473426818848, + "sampling/sampling_logp_difference/mean": 0.01745998114347458, + "step": 136 + }, + { + "clip_ratio/high_max": 6.184750873217126e-06, + "clip_ratio/high_mean": 2.3343936845776625e-06, + "clip_ratio/low_mean": 3.130356230940379e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.363795599398145e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14083.0, + "completions/mean_length": 5317.515625, + "completions/mean_terminated_length": 5230.3779296875, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.9808826446533203, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021007952746003866, + "learning_rate": 1e-05, + "loss": -0.0037, + "num_tokens": 105889289.0, + "reward": 0.4296875, + "reward_std": 0.3151204586029053, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.004087196197360754, + "sampling/sampling_logp_difference/max": 5.499896049499512, + "sampling/sampling_logp_difference/mean": 0.020308660343289375, + "step": 137 + }, + { + "clip_ratio/high_max": 6.264094281505095e-06, + "clip_ratio/high_mean": 1.5660235703762737e-06, + "clip_ratio/low_mean": 4.276942695469188e-05, + "clip_ratio/low_min": 5.777519618277438e-06, + "clip_ratio/region_mean": 4.4335450525068154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16052.0, + "completions/mean_length": 7302.3671875, + "completions/mean_terminated_length": 6776.9833984375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.8526253402233124, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001218521734699607, + "learning_rate": 1e-05, + "loss": 0.0705, + "num_tokens": 106849048.0, + "reward": 0.28125, + "reward_std": 0.22331714630126953, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999129772186279, + 
"sampling/importance_sampling_ratio/min": 0.010783779434859753, + "sampling/sampling_logp_difference/max": 4.529712200164795, + "sampling/sampling_logp_difference/mean": 0.019228527322411537, + "step": 138 + }, + { + "clip_ratio/high_max": 1.1513777735672193e-05, + "clip_ratio/high_mean": 2.878444433918048e-06, + "clip_ratio/low_mean": 3.477262850992702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7651072489097714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14681.0, + "completions/mean_length": 4603.46875, + "completions/mean_terminated_length": 4510.70849609375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.7025937959551811, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002826553536579013, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 107456676.0, + "reward": 0.625, + "reward_std": 0.35878273844718933, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932050704956, + "sampling/importance_sampling_ratio/min": 0.0006447202758863568, + "sampling/sampling_logp_difference/max": 7.346693992614746, + "sampling/sampling_logp_difference/mean": 0.016313642263412476, + "step": 139 + }, + { + "clip_ratio/high_max": 4.341936346463626e-06, + "clip_ratio/high_mean": 1.0854840866159066e-06, + "clip_ratio/low_mean": 4.9752483846532414e-05, + "clip_ratio/low_min": 1.0369344636274036e-05, + "clip_ratio/region_mean": 5.083796850158251e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16081.0, + "completions/mean_length": 7055.921875, + "completions/mean_terminated_length": 6755.01611328125, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + 
"entropy": 0.8677415996789932, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015939075965434313, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 108380090.0, + "reward": 0.359375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.007212483324110508, + "sampling/sampling_logp_difference/max": 4.931941986083984, + "sampling/sampling_logp_difference/mean": 0.019018646329641342, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.017062949264073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.017062949264073e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15374.0, + "completions/mean_length": 6947.546875, + "completions/mean_terminated_length": 6563.951171875, + "completions/min_length": 578.0, + "completions/min_terminated_length": 578.0, + "entropy": 0.9537070691585541, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014140130952000618, + "learning_rate": 1e-05, + "loss": 0.0685, + "num_tokens": 109288008.0, + "reward": 0.28125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532103538513, + "sampling/importance_sampling_ratio/min": 0.002557439962401986, + "sampling/sampling_logp_difference/max": 5.968748569488525, + "sampling/sampling_logp_difference/mean": 0.02024715766310692, + "step": 141 + }, + { + "clip_ratio/high_max": 1.4431375348067377e-05, + "clip_ratio/high_mean": 3.607843837016844e-06, + 
"clip_ratio/low_mean": 2.80186426380169e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.162648749821528e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16053.0, + "completions/mean_length": 5742.4140625, + "completions/mean_terminated_length": 5658.6220703125, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 0.8954835087060928, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012910671066492796, + "learning_rate": 1e-05, + "loss": 0.0939, + "num_tokens": 110041333.0, + "reward": 0.4375, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 2.282886634930037e-05, + "sampling/sampling_logp_difference/max": 10.687484741210938, + "sampling/sampling_logp_difference/mean": 0.017754144966602325, + "step": 142 + }, + { + "clip_ratio/high_max": 3.2560687031946145e-05, + "clip_ratio/high_mean": 9.421434697287623e-06, + "clip_ratio/low_mean": 2.801389479145655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7435329431900755e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14702.0, + "completions/max_terminated_length": 14702.0, + "completions/mean_length": 5582.1640625, + "completions/mean_terminated_length": 5582.1640625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.9963158369064331, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002162793418392539, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 110775762.0, + "reward": 0.3359375, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 
0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999851584434509, + "sampling/importance_sampling_ratio/min": 0.0010016339365392923, + "sampling/sampling_logp_difference/max": 6.90612268447876, + "sampling/sampling_logp_difference/mean": 0.020483866333961487, + "step": 143 + }, + { + "clip_ratio/high_max": 1.746983889461262e-05, + "clip_ratio/high_mean": 7.333224402827909e-06, + "clip_ratio/low_mean": 3.6373660350363934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3706885207939195e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13832.0, + "completions/mean_length": 6047.8984375, + "completions/mean_terminated_length": 5883.83349609375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.913147509098053, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00287337857298553, + "learning_rate": 1e-05, + "loss": 0.045, + "num_tokens": 111568589.0, + "reward": 0.4453125, + "reward_std": 0.3453328609466553, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 9.964095625036862e-06, + "sampling/sampling_logp_difference/max": 11.516522407531738, + "sampling/sampling_logp_difference/mean": 0.018301380798220634, + "step": 144 + }, + { + "clip_ratio/high_max": 2.6439459361426998e-05, + "clip_ratio/high_mean": 6.6098648403567495e-06, + "clip_ratio/low_mean": 4.587054809235269e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.248041247796209e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 6462.28125, + 
"completions/mean_terminated_length": 6224.16015625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 1.1468544080853462, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017887315480038524, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 112414673.0, + "reward": 0.2734375, + "reward_std": 0.23592589795589447, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.0007102306117303669, + "sampling/sampling_logp_difference/max": 7.249920845031738, + "sampling/sampling_logp_difference/mean": 0.021768372505903244, + "step": 145 + }, + { + "clip_ratio/high_max": 1.6320968370564515e-05, + "clip_ratio/high_mean": 5.031390969634231e-06, + "clip_ratio/low_mean": 3.567474152532668e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0706131812839885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16126.0, + "completions/mean_length": 6897.0078125, + "completions/mean_terminated_length": 6822.30712890625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.9793258458375931, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022704254370182753, + "learning_rate": 1e-05, + "loss": 0.0423, + "num_tokens": 113321722.0, + "reward": 0.2890625, + "reward_std": 0.34297874569892883, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000591278076172, + "sampling/importance_sampling_ratio/min": 5.476621663547121e-05, + "sampling/sampling_logp_difference/max": 9.812437057495117, + 
"sampling/sampling_logp_difference/mean": 0.020364979282021523, + "step": 146 + }, + { + "clip_ratio/high_max": 8.64622779772617e-06, + "clip_ratio/high_mean": 2.1615569494315423e-06, + "clip_ratio/low_mean": 4.702959677160834e-05, + "clip_ratio/low_min": 6.21032540948363e-06, + "clip_ratio/region_mean": 4.9191153607353044e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15914.0, + "completions/mean_length": 6779.7421875, + "completions/mean_terminated_length": 6307.4013671875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9858463555574417, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022105660755187273, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 114210841.0, + "reward": 0.390625, + "reward_std": 0.3676722049713135, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 4.2232295527355745e-06, + "sampling/sampling_logp_difference/max": 12.374910354614258, + "sampling/sampling_logp_difference/mean": 0.021493885666131973, + "step": 147 + }, + { + "clip_ratio/high_max": 9.080286417884054e-06, + "clip_ratio/high_mean": 2.2700716044710134e-06, + "clip_ratio/low_mean": 3.73501702597423e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9620241750526475e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15145.0, + "completions/mean_length": 6204.34375, + "completions/mean_terminated_length": 5960.0322265625, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.9073990881443024, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.0021019333507865667, + "learning_rate": 1e-05, + "loss": 0.0985, + "num_tokens": 115023469.0, + "reward": 0.4375, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999035596847534, + "sampling/importance_sampling_ratio/min": 7.850129009057127e-07, + "sampling/sampling_logp_difference/max": 14.057565689086914, + "sampling/sampling_logp_difference/mean": 0.019073951989412308, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.07747756223398e-05, + "clip_ratio/low_min": 6.719346401951043e-06, + "clip_ratio/region_mean": 7.07747756223398e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14484.0, + "completions/mean_length": 6382.890625, + "completions/mean_terminated_length": 5891.0322265625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.8928572610020638, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002703179605305195, + "learning_rate": 1e-05, + "loss": 0.1215, + "num_tokens": 115860183.0, + "reward": 0.46875, + "reward_std": 0.3924228549003601, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999281764030457, + "sampling/importance_sampling_ratio/min": 0.002329134149476886, + "sampling/sampling_logp_difference/max": 6.062258720397949, + "sampling/sampling_logp_difference/mean": 0.018461842089891434, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.991344158293941e-05, + "clip_ratio/low_min": 4.287576302886009e-06, + "clip_ratio/region_mean": 3.991344158293941e-05, + 
"completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 6856.25, + "completions/mean_terminated_length": 6387.671875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.9867237955331802, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025988349225372076, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 116757023.0, + "reward": 0.34375, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999343156814575, + "sampling/importance_sampling_ratio/min": 2.9312623155419715e-05, + "sampling/sampling_logp_difference/max": 10.437492370605469, + "sampling/sampling_logp_difference/mean": 0.019526638090610504, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.096957769661458e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096957769661458e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15873.0, + "completions/mean_length": 6312.1328125, + "completions/mean_terminated_length": 5816.794921875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.8896873891353607, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036364132538437843, + "learning_rate": 1e-05, + "loss": 0.0579, + "num_tokens": 117584064.0, + "reward": 0.2578125, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998835325241089, + 
"sampling/importance_sampling_ratio/min": 0.0009706970304250717, + "sampling/sampling_logp_difference/max": 6.937496185302734, + "sampling/sampling_logp_difference/mean": 0.019127443432807922, + "step": 151 + }, + { + "clip_ratio/high_max": 3.0199071261449717e-06, + "clip_ratio/high_mean": 7.549767815362429e-07, + "clip_ratio/low_mean": 4.133729697741728e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.20922739863272e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16279.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 5875.625, + "completions/mean_terminated_length": 5875.625, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9082999676465988, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025688125751912594, + "learning_rate": 1e-05, + "loss": 0.0737, + "num_tokens": 118354672.0, + "reward": 0.453125, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999657273292542, + "sampling/importance_sampling_ratio/min": 0.0024201429914683104, + "sampling/sampling_logp_difference/max": 6.023928642272949, + "sampling/sampling_logp_difference/mean": 0.019491348415613174, + "step": 152 + }, + { + "clip_ratio/high_max": 5.6563644648122136e-06, + "clip_ratio/high_mean": 1.4140911162030534e-06, + "clip_ratio/low_mean": 4.235651454109757e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.377060565730062e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13490.0, + "completions/mean_length": 6524.6015625, + "completions/mean_terminated_length": 6123.81298828125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.9052172750234604, + 
"epoch": 0.140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026063446421176195, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 119210997.0, + "reward": 0.2109375, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999611377716064, + "sampling/importance_sampling_ratio/min": 8.774310117587447e-06, + "sampling/sampling_logp_difference/max": 11.643682479858398, + "sampling/sampling_logp_difference/mean": 0.019871948286890984, + "step": 153 + }, + { + "clip_ratio/high_max": 2.8274008855078137e-05, + "clip_ratio/high_mean": 7.068502213769534e-06, + "clip_ratio/low_mean": 5.824237177876057e-05, + "clip_ratio/low_min": 9.362729997519637e-06, + "clip_ratio/region_mean": 6.531087387884327e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14731.0, + "completions/mean_length": 6606.34375, + "completions/mean_terminated_length": 6208.8779296875, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "entropy": 0.923908606171608, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002111563691869378, + "learning_rate": 1e-05, + "loss": 0.0834, + "num_tokens": 120076777.0, + "reward": 0.3359375, + "reward_std": 0.32879000902175903, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999362230300903, + "sampling/importance_sampling_ratio/min": 7.220578579492098e-10, + "sampling/sampling_logp_difference/max": 21.04891586303711, + "sampling/sampling_logp_difference/mean": 0.01944371685385704, + "step": 154 + }, + { + "clip_ratio/high_max": 2.226728611276485e-05, + "clip_ratio/high_mean": 
6.534373824251816e-06, + "clip_ratio/low_mean": 2.137331728135905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7907691105610866e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 7156.2578125, + "completions/mean_terminated_length": 6934.79248046875, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 1.0026871338486671, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002556675113737583, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 121013298.0, + "reward": 0.2890625, + "reward_std": 0.26013973355293274, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322295188904, + "sampling/importance_sampling_ratio/min": 1.3007297638978343e-05, + "sampling/sampling_logp_difference/max": 11.25, + "sampling/sampling_logp_difference/mean": 0.02018606849014759, + "step": 155 + }, + { + "clip_ratio/high_max": 9.798196060728515e-06, + "clip_ratio/high_mean": 2.4495490151821286e-06, + "clip_ratio/low_mean": 6.042695122232544e-05, + "clip_ratio/low_min": 1.0388962436991278e-05, + "clip_ratio/region_mean": 6.287649966907338e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15184.0, + "completions/mean_length": 6177.3828125, + "completions/mean_terminated_length": 5848.13671875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.7995355725288391, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0032885256223380566, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 121820851.0, + "reward": 0.4609375, + "reward_std": 0.35141900181770325, + "rewards/accuracy_reward/mean": 
0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.3007570487388875e-05, + "sampling/sampling_logp_difference/max": 11.249979019165039, + "sampling/sampling_logp_difference/mean": 0.018013037741184235, + "step": 156 + }, + { + "clip_ratio/high_max": 1.836798173826537e-05, + "clip_ratio/high_mean": 4.591995434566343e-06, + "clip_ratio/low_mean": 5.0241384542459855e-05, + "clip_ratio/low_min": 7.033341489659506e-06, + "clip_ratio/region_mean": 5.483338100020774e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15941.0, + "completions/mean_length": 6033.359375, + "completions/mean_terminated_length": 5612.6015625, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.8770530596375465, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0035782051272690296, + "learning_rate": 1e-05, + "loss": 0.1015, + "num_tokens": 122615329.0, + "reward": 0.421875, + "reward_std": 0.3253750801086426, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000176429748535, + "sampling/importance_sampling_ratio/min": 8.344570233020931e-05, + "sampling/sampling_logp_difference/max": 9.391314506530762, + "sampling/sampling_logp_difference/mean": 0.018681444227695465, + "step": 157 + }, + { + "clip_ratio/high_max": 1.2653852763833129e-05, + "clip_ratio/high_mean": 4.80866970065108e-06, + "clip_ratio/low_mean": 3.11289915089219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.593766109588614e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14860.0, + 
"completions/mean_length": 8237.46875, + "completions/mean_terminated_length": 7974.67724609375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.9543669074773788, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026586023159325123, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 123688709.0, + "reward": 0.328125, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228119850159, + "sampling/importance_sampling_ratio/min": 0.00017198453133460134, + "sampling/sampling_logp_difference/max": 8.668106079101562, + "sampling/sampling_logp_difference/mean": 0.020768223330378532, + "step": 158 + }, + { + "clip_ratio/high_max": 4.32630758950836e-06, + "clip_ratio/high_mean": 1.08157689737709e-06, + "clip_ratio/low_mean": 3.721513610344118e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.829671300081827e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6649.1015625, + "completions/mean_terminated_length": 6000.10888671875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.8519875407218933, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028182135429233313, + "learning_rate": 1e-05, + "loss": 0.0528, + "num_tokens": 124557298.0, + "reward": 0.4140625, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999065399169922, + "sampling/importance_sampling_ratio/min": 6.050919910194352e-05, + "sampling/sampling_logp_difference/max": 
9.712715148925781, + "sampling/sampling_logp_difference/mean": 0.019195500761270523, + "step": 159 + }, + { + "clip_ratio/high_max": 9.812353937377338e-06, + "clip_ratio/high_mean": 2.4530884843443346e-06, + "clip_ratio/low_mean": 1.864515820670931e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1098246747897065e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14946.0, + "completions/mean_length": 6262.125, + "completions/mean_terminated_length": 5587.33349609375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "entropy": 0.9227473363280296, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018271139124408364, + "learning_rate": 1e-05, + "loss": 0.0162, + "num_tokens": 125378002.0, + "reward": 0.421875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998780488967896, + "sampling/importance_sampling_ratio/min": 1.1365813179509132e-06, + "sampling/sampling_logp_difference/max": 13.687485694885254, + "sampling/sampling_logp_difference/mean": 0.018991345539689064, + "step": 160 + }, + { + "clip_ratio/high_max": 1.976754219867871e-05, + "clip_ratio/high_mean": 5.881085598957725e-06, + "clip_ratio/low_mean": 4.014476598968031e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6025852043385385e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16334.0, + "completions/mean_length": 6543.2734375, + "completions/mean_terminated_length": 6465.78759765625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 0.9931852892041206, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.0028531099669635296, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 126236133.0, + "reward": 0.2734375, + "reward_std": 0.3148259222507477, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000286102294922, + "sampling/importance_sampling_ratio/min": 1.9964969396824017e-05, + "sampling/sampling_logp_difference/max": 10.821531295776367, + "sampling/sampling_logp_difference/mean": 0.020335232838988304, + "step": 161 + }, + { + "clip_ratio/high_max": 2.1589371499430854e-05, + "clip_ratio/high_mean": 8.165637723323016e-06, + "clip_ratio/low_mean": 6.554757646881626e-05, + "clip_ratio/low_min": 5.570906523644226e-06, + "clip_ratio/region_mean": 7.371321362370509e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13107.0, + "completions/mean_length": 5567.2890625, + "completions/mean_terminated_length": 5482.1181640625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9842768535017967, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017482106341049075, + "learning_rate": 1e-05, + "loss": 0.0019, + "num_tokens": 126974666.0, + "reward": 0.25, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999868631362915, + "sampling/importance_sampling_ratio/min": 0.011517977342009544, + "sampling/sampling_logp_difference/max": 4.463846206665039, + "sampling/sampling_logp_difference/mean": 0.020022576674818993, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0515780559217092e-05, + "clip_ratio/high_mean": 2.628945139804273e-06, + "clip_ratio/low_mean": 5.164334470464382e-05, + "clip_ratio/low_min": 
3.369817250131746e-06, + "clip_ratio/region_mean": 5.427229007182177e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14865.0, + "completions/mean_length": 7232.6328125, + "completions/mean_terminated_length": 6937.42724609375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9599866047501564, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001637064153328538, + "learning_rate": 1e-05, + "loss": 0.0918, + "num_tokens": 127921331.0, + "reward": 0.3671875, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000075101852417, + "sampling/importance_sampling_ratio/min": 0.00023060032981447875, + "sampling/sampling_logp_difference/max": 8.374824523925781, + "sampling/sampling_logp_difference/mean": 0.01991824433207512, + "step": 163 + }, + { + "clip_ratio/high_max": 1.7373587070323993e-05, + "clip_ratio/high_mean": 4.343396767580998e-06, + "clip_ratio/low_mean": 2.182850187182339e-05, + "clip_ratio/low_min": 4.473072294786107e-06, + "clip_ratio/region_mean": 2.6171898525717552e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15075.0, + "completions/max_terminated_length": 15075.0, + "completions/mean_length": 4948.546875, + "completions/mean_terminated_length": 4948.546875, + "completions/min_length": 609.0, + "completions/min_terminated_length": 609.0, + "entropy": 0.9903113394975662, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00162114470731467, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 128575785.0, + "reward": 0.4140625, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999828040599823, + "sampling/importance_sampling_ratio/min": 3.263082589910482e-06, + "sampling/sampling_logp_difference/max": 12.632838249206543, + "sampling/sampling_logp_difference/mean": 0.019144343212246895, + "step": 164 + }, + { + "clip_ratio/high_max": 1.2063027497788426e-05, + "clip_ratio/high_mean": 4.366232360553113e-06, + "clip_ratio/low_mean": 3.965049324961001e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4016725382789446e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6205.234375, + "completions/mean_terminated_length": 6125.08642578125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.9164782017469406, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021650632843375206, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 129389191.0, + "reward": 0.5078125, + "reward_std": 0.3214311897754669, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 0.0009118906455114484, + "sampling/sampling_logp_difference/max": 6.999990463256836, + "sampling/sampling_logp_difference/mean": 0.01929439604282379, + "step": 165 + }, + { + "clip_ratio/high_max": 2.6859754598262953e-05, + "clip_ratio/high_mean": 6.714938649565738e-06, + "clip_ratio/low_mean": 1.6451138890261063e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.31660775398268e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15741.0, + "completions/max_terminated_length": 15741.0, + "completions/mean_length": 4911.25, + "completions/mean_terminated_length": 4911.25, + 
"completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.9057909473776817, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019606768619269133, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 130036711.0, + "reward": 0.296875, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999849796295166, + "sampling/importance_sampling_ratio/min": 0.0008691518451087177, + "sampling/sampling_logp_difference/max": 7.047992706298828, + "sampling/sampling_logp_difference/mean": 0.020085586234927177, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.847699741119868e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.847699741119868e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15152.0, + "completions/mean_length": 6222.0859375, + "completions/mean_terminated_length": 5978.2001953125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.102900318801403, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0013436009176075459, + "learning_rate": 1e-05, + "loss": 0.0116, + "num_tokens": 130854714.0, + "reward": 0.21875, + "reward_std": 0.1825428307056427, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322891235352, + "sampling/importance_sampling_ratio/min": 3.319984534755349e-05, + "sampling/sampling_logp_difference/max": 10.312965393066406, + "sampling/sampling_logp_difference/mean": 0.02261950448155403, + "step": 167 + }, + { + "clip_ratio/high_max": 
1.0113483313034521e-05, + "clip_ratio/high_mean": 3.4217127904412337e-06, + "clip_ratio/low_mean": 3.916404375559068e-05, + "clip_ratio/low_min": 4.7332350732176565e-06, + "clip_ratio/region_mean": 4.258575745552662e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 6490.7734375, + "completions/mean_terminated_length": 6333.73828125, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "entropy": 0.9576810225844383, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025689650792628527, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 131703429.0, + "reward": 0.3515625, + "reward_std": 0.3385029733181, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999434947967529, + "sampling/importance_sampling_ratio/min": 0.00037599547067657113, + "sampling/sampling_logp_difference/max": 7.8859333992004395, + "sampling/sampling_logp_difference/mean": 0.01931593380868435, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.780203212500055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.780203212500055e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14897.0, + "completions/mean_length": 6957.453125, + "completions/mean_terminated_length": 6653.37060546875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.9904302433133125, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002132449997588992, + "learning_rate": 1e-05, + "loss": 0.0848, + "num_tokens": 132614583.0, + "reward": 0.34375, + "reward_std": 0.2648528814315796, + 
"rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999384880065918, + "sampling/importance_sampling_ratio/min": 9.969094350026353e-08, + "sampling/sampling_logp_difference/max": 16.121191024780273, + "sampling/sampling_logp_difference/mean": 0.019748074933886528, + "step": 169 + }, + { + "clip_ratio/high_max": 1.6620725091343047e-05, + "clip_ratio/high_mean": 6.429913469219173e-06, + "clip_ratio/low_mean": 6.847188262781856e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.49017954149167e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 6781.3828125, + "completions/mean_terminated_length": 6391.0322265625, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.7702180370688438, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037141458597034216, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 133500672.0, + "reward": 0.4140625, + "reward_std": 0.39294689893722534, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0015879785642027855, + "sampling/sampling_logp_difference/max": 6.445293426513672, + "sampling/sampling_logp_difference/mean": 0.017618997022509575, + "step": 170 + }, + { + "clip_ratio/high_max": 8.414747526330757e-06, + "clip_ratio/high_mean": 2.1036868815826892e-06, + "clip_ratio/low_mean": 2.6748189156933222e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8851876209046168e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16383.0, + "completions/mean_length": 7167.6953125, + "completions/mean_terminated_length": 7095.1259765625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 1.0333677157759666, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021144442725926638, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 134437361.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999521970748901, + "sampling/importance_sampling_ratio/min": 0.0020202873274683952, + "sampling/sampling_logp_difference/max": 6.20451545715332, + "sampling/sampling_logp_difference/mean": 0.021626941859722137, + "step": 171 + }, + { + "clip_ratio/high_max": 7.359868050116347e-06, + "clip_ratio/high_mean": 1.8399670125290868e-06, + "clip_ratio/low_mean": 3.642534238679218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826530939932127e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15035.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "entropy": 0.8884351700544357, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025075129233300686, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 135215690.0, + "reward": 0.5078125, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000145435333252, + "sampling/importance_sampling_ratio/min": 
8.12270229744172e-07, + "sampling/sampling_logp_difference/max": 14.023432731628418, + "sampling/sampling_logp_difference/mean": 0.018633443862199783, + "step": 172 + }, + { + "clip_ratio/high_max": 6.931506504770368e-06, + "clip_ratio/high_mean": 1.732876626192592e-06, + "clip_ratio/low_mean": 6.461201871843514e-05, + "clip_ratio/low_min": 9.272769602830522e-06, + "clip_ratio/region_mean": 6.634489625412243e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 7267.296875, + "completions/mean_terminated_length": 7048.49609375, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 1.072906270623207, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023191061336547136, + "learning_rate": 1e-05, + "loss": 0.1216, + "num_tokens": 136165880.0, + "reward": 0.3046875, + "reward_std": 0.3400956988334656, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999949932098389, + "sampling/importance_sampling_ratio/min": 8.937300299294293e-05, + "sampling/sampling_logp_difference/max": 9.322691917419434, + "sampling/sampling_logp_difference/mean": 0.02122514694929123, + "step": 173 + }, + { + "clip_ratio/high_max": 7.245442930070567e-06, + "clip_ratio/high_mean": 1.8113607325176417e-06, + "clip_ratio/low_mean": 5.239449455984868e-05, + "clip_ratio/low_min": 7.146442158045829e-06, + "clip_ratio/region_mean": 5.420585534920974e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16230.0, + "completions/mean_length": 7433.1640625, + "completions/mean_terminated_length": 7362.68505859375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 
1.0957217290997505, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029631280340254307, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 137140413.0, + "reward": 0.265625, + "reward_std": 0.28749164938926697, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999362230300903, + "sampling/importance_sampling_ratio/min": 0.0086804935708642, + "sampling/sampling_logp_difference/max": 4.746676921844482, + "sampling/sampling_logp_difference/mean": 0.022480733692646027, + "step": 174 + }, + { + "clip_ratio/high_max": 6.239364211069187e-06, + "clip_ratio/high_mean": 1.5598410527672968e-06, + "clip_ratio/low_mean": 3.690561521807467e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.846545632768539e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15985.0, + "completions/mean_length": 7073.90625, + "completions/mean_terminated_length": 6926.12744140625, + "completions/min_length": 1398.0, + "completions/min_terminated_length": 1398.0, + "entropy": 0.9333122596144676, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.000832411227747798, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 138064537.0, + "reward": 0.3671875, + "reward_std": 0.13888052105903625, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998854994773865, + "sampling/importance_sampling_ratio/min": 0.0002638234291225672, + "sampling/sampling_logp_difference/max": 8.240230560302734, + "sampling/sampling_logp_difference/mean": 0.019753674045205116, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 3.8504628946611774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8504628946611774e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15719.0, + "completions/mean_length": 5680.59375, + "completions/mean_terminated_length": 5596.31494140625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 0.9720541462302208, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002570893382653594, + "learning_rate": 1e-05, + "loss": 0.0289, + "num_tokens": 138809293.0, + "reward": 0.3515625, + "reward_std": 0.3703257441520691, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 1.1064497584811761e-07, + "sampling/sampling_logp_difference/max": 16.016939163208008, + "sampling/sampling_logp_difference/mean": 0.019471734762191772, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.860648109821341e-05, + "clip_ratio/low_min": 6.799404218327254e-06, + "clip_ratio/region_mean": 3.860648109821341e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15983.0, + "completions/mean_length": 8024.34375, + "completions/mean_terminated_length": 7540.72705078125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 1.0136078596115112, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017353243893012404, + "learning_rate": 1e-05, + "loss": 0.0753, + "num_tokens": 139856281.0, + "reward": 0.3046875, + "reward_std": 0.2551271915435791, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 
0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999872446060181, + "sampling/importance_sampling_ratio/min": 0.0012184304650872946, + "sampling/sampling_logp_difference/max": 6.71019172668457, + "sampling/sampling_logp_difference/mean": 0.021411728113889694, + "step": 177 + }, + { + "clip_ratio/high_max": 2.0505477323240484e-05, + "clip_ratio/high_mean": 5.126369330810121e-06, + "clip_ratio/low_mean": 5.543978954847262e-05, + "clip_ratio/low_min": 6.273411372603732e-06, + "clip_ratio/region_mean": 6.056615916349983e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + "completions/mean_length": 7543.96875, + "completions/mean_terminated_length": 7032.5615234375, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "entropy": 0.9921196177601814, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019490106496959925, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 140843861.0, + "reward": 0.296875, + "reward_std": 0.34717273712158203, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728202819824, + "sampling/importance_sampling_ratio/min": 0.002482798881828785, + "sampling/sampling_logp_difference/max": 5.998368740081787, + "sampling/sampling_logp_difference/mean": 0.020561274141073227, + "step": 178 + }, + { + "clip_ratio/high_max": 2.1780562747153454e-05, + "clip_ratio/high_mean": 7.637661838089116e-06, + "clip_ratio/low_mean": 5.0004296554106986e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.76419583921961e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16320.0, + "completions/max_terminated_length": 16320.0, + "completions/mean_length": 6285.1796875, + 
"completions/mean_terminated_length": 6285.1796875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.8724544793367386, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027221282944083214, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 141666372.0, + "reward": 0.3984375, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271631240845, + "sampling/importance_sampling_ratio/min": 0.0001951520098373294, + "sampling/sampling_logp_difference/max": 8.541731834411621, + "sampling/sampling_logp_difference/mean": 0.01924072578549385, + "step": 179 + }, + { + "clip_ratio/high_max": 1.2773067282978445e-05, + "clip_ratio/high_mean": 3.1932668207446113e-06, + "clip_ratio/low_mean": 5.425560334515467e-05, + "clip_ratio/low_min": 8.365065696125384e-06, + "clip_ratio/region_mean": 5.744886925640458e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 7659.6796875, + "completions/mean_terminated_length": 7230.6142578125, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.9285296350717545, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016997806960716844, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 142665635.0, + "reward": 0.328125, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 1.8975185867020627e-07, + "sampling/sampling_logp_difference/max": 15.477548599243164, + 
"sampling/sampling_logp_difference/mean": 0.020274491980671883, + "step": 180 + }, + { + "clip_ratio/high_max": 2.486542780388845e-05, + "clip_ratio/high_mean": 6.216356950972113e-06, + "clip_ratio/low_mean": 3.3204854901214276e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9421211965873226e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14834.0, + "completions/max_terminated_length": 14834.0, + "completions/mean_length": 5331.03125, + "completions/mean_terminated_length": 5331.03125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.7720941603183746, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030591271352022886, + "learning_rate": 1e-05, + "loss": -0.0544, + "num_tokens": 143364919.0, + "reward": 0.5390625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 2.998966630585187e-09, + "sampling/sampling_logp_difference/max": 19.624998092651367, + "sampling/sampling_logp_difference/mean": 0.01690140925347805, + "step": 181 + }, + { + "clip_ratio/high_max": 1.0562233001110144e-05, + "clip_ratio/high_mean": 3.6131090155322454e-06, + "clip_ratio/low_mean": 5.028249574934307e-05, + "clip_ratio/low_min": 3.0328762932185782e-06, + "clip_ratio/region_mean": 5.3895605788056855e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15895.0, + "completions/mean_length": 7086.65625, + "completions/mean_terminated_length": 6708.71533203125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.8584504351019859, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0015365247381851077, + 
"learning_rate": 1e-05, + "loss": 0.0465, + "num_tokens": 144293867.0, + "reward": 0.2578125, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915791511536, + "sampling/importance_sampling_ratio/min": 0.00015850062482059002, + "sampling/sampling_logp_difference/max": 8.749752044677734, + "sampling/sampling_logp_difference/mean": 0.019430743530392647, + "step": 182 + }, + { + "clip_ratio/high_max": 6.546216354763601e-06, + "clip_ratio/high_mean": 1.6365540886909002e-06, + "clip_ratio/low_mean": 3.201156800969329e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.364812232575787e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 5455.6484375, + "completions/mean_terminated_length": 5369.5986328125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "entropy": 0.8517125397920609, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003156432416290045, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 145013318.0, + "reward": 0.390625, + "reward_std": 0.25726157426834106, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.10733240842819214, + "sampling/sampling_logp_difference/max": 2.2318246364593506, + "sampling/sampling_logp_difference/mean": 0.01860412396490574, + "step": 183 + }, + { + "clip_ratio/high_max": 4.192453593532264e-05, + "clip_ratio/high_mean": 1.196126476088466e-05, + "clip_ratio/low_mean": 4.6358243707800284e-05, + "clip_ratio/low_min": 5.576871444645803e-06, + 
"clip_ratio/region_mean": 5.8319507388659986e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 6670.2890625, + "completions/mean_terminated_length": 6192.5654296875, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "entropy": 0.8807757273316383, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028573600575327873, + "learning_rate": 1e-05, + "loss": 0.1163, + "num_tokens": 145886291.0, + "reward": 0.46875, + "reward_std": 0.38269224762916565, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 0.0006086408975534141, + "sampling/sampling_logp_difference/max": 7.404282093048096, + "sampling/sampling_logp_difference/mean": 0.01879466325044632, + "step": 184 + }, + { + "clip_ratio/high_max": 5.954649168415926e-06, + "clip_ratio/high_mean": 1.4886622921039816e-06, + "clip_ratio/low_mean": 2.10815471746173e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.257020946672128e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12881.0, + "completions/max_terminated_length": 12881.0, + "completions/mean_length": 5849.8359375, + "completions/mean_terminated_length": 5849.8359375, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 0.879327155649662, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028504019137471914, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 146658174.0, + "reward": 0.4140625, + "reward_std": 0.2596206068992615, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999953508377075, + "sampling/importance_sampling_ratio/min": 0.0004885811940766871, + "sampling/sampling_logp_difference/max": 7.62400484085083, + "sampling/sampling_logp_difference/mean": 0.019282957538962364, + "step": 185 + }, + { + "clip_ratio/high_max": 1.0011702670453815e-05, + "clip_ratio/high_mean": 3.558776029422006e-06, + "clip_ratio/low_mean": 2.338160857107141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.694038448680658e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15885.0, + "completions/mean_length": 6376.7578125, + "completions/mean_terminated_length": 6297.96044921875, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "entropy": 1.0437361896038055, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026664668694138527, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 147494367.0, + "reward": 0.25, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999197721481323, + "sampling/importance_sampling_ratio/min": 5.43163696420379e-06, + "sampling/sampling_logp_difference/max": 12.123270034790039, + "sampling/sampling_logp_difference/mean": 0.020121946930885315, + "step": 186 + }, + { + "clip_ratio/high_max": 4.071263447258389e-06, + "clip_ratio/high_mean": 1.0178158618145972e-06, + "clip_ratio/low_mean": 5.679830292137922e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.781611889688065e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15314.0, + "completions/max_terminated_length": 15314.0, + "completions/mean_length": 6753.0390625, + "completions/mean_terminated_length": 6753.0390625, + "completions/min_length": 611.0, + 
"completions/min_terminated_length": 611.0, + "entropy": 0.8704448491334915, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0013236560625955462, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 148377476.0, + "reward": 0.390625, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999928891658783, + "sampling/importance_sampling_ratio/min": 0.0005196586716920137, + "sampling/sampling_logp_difference/max": 7.562338352203369, + "sampling/sampling_logp_difference/mean": 0.019745871424674988, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1118761626203195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1118761626203195e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14670.0, + "completions/mean_length": 6334.5625, + "completions/mean_terminated_length": 6255.43310546875, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "entropy": 0.9675566852092743, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003227849490940571, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 149213140.0, + "reward": 0.265625, + "reward_std": 0.22331714630126953, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 2.0039660739712417e-06, + "sampling/sampling_logp_difference/max": 13.120382308959961, + "sampling/sampling_logp_difference/mean": 0.02062838338315487, + "step": 188 + }, + { + "clip_ratio/high_max": 2.159174937332864e-05, + 
"clip_ratio/high_mean": 7.343517381741549e-06, + "clip_ratio/low_mean": 2.7624131234915694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.496764873034408e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15878.0, + "completions/mean_length": 5986.3125, + "completions/mean_terminated_length": 5650.90283203125, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "entropy": 0.9257830232381821, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023177729453891516, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 149998732.0, + "reward": 0.4375, + "reward_std": 0.32589423656463623, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000395774841309, + "sampling/importance_sampling_ratio/min": 0.00015848006296437234, + "sampling/sampling_logp_difference/max": 8.749881744384766, + "sampling/sampling_logp_difference/mean": 0.018431315198540688, + "step": 189 + }, + { + "clip_ratio/high_max": 1.0338640322515857e-05, + "clip_ratio/high_mean": 2.5846600806289644e-06, + "clip_ratio/low_mean": 4.149641688400152e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.408107668041339e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15871.0, + "completions/mean_length": 7341.390625, + "completions/mean_terminated_length": 7049.693359375, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.9617493599653244, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001992360921576619, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 150958414.0, + "reward": 0.2890625, + "reward_std": 0.29119330644607544, + 
"rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.0011714966967701912, + "sampling/sampling_logp_difference/max": 6.7494730949401855, + "sampling/sampling_logp_difference/mean": 0.02040865272283554, + "step": 190 + }, + { + "clip_ratio/high_max": 1.402321640853188e-05, + "clip_ratio/high_mean": 4.2662558144002105e-06, + "clip_ratio/low_mean": 4.847697437071474e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.274322995774128e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15966.0, + "completions/mean_length": 6194.53125, + "completions/mean_terminated_length": 5605.0576171875, + "completions/min_length": 1022.0, + "completions/min_terminated_length": 1022.0, + "entropy": 0.7917485684156418, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002244317904114723, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 151770450.0, + "reward": 0.46875, + "reward_std": 0.29432153701782227, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999660849571228, + "sampling/importance_sampling_ratio/min": 0.0007107177516445518, + "sampling/sampling_logp_difference/max": 7.249235153198242, + "sampling/sampling_logp_difference/mean": 0.016992967575788498, + "step": 191 + }, + { + "clip_ratio/high_max": 1.0843792097148253e-05, + "clip_ratio/high_mean": 2.710948024287063e-06, + "clip_ratio/low_mean": 5.327871485860669e-05, + "clip_ratio/low_min": 8.019090955713182e-06, + "clip_ratio/region_mean": 5.598966299658059e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15010.0, + "completions/mean_length": 6883.328125, + "completions/mean_terminated_length": 6808.51953125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.8912994414567947, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028390102088451385, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 152668740.0, + "reward": 0.3359375, + "reward_std": 0.3684907555580139, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999127388000488, + "sampling/importance_sampling_ratio/min": 0.00014138928963802755, + "sampling/sampling_logp_difference/max": 8.863993644714355, + "sampling/sampling_logp_difference/mean": 0.018673548474907875, + "step": 192 + }, + { + "clip_ratio/high_max": 1.0902768735832069e-05, + "clip_ratio/high_mean": 2.7256921839580173e-06, + "clip_ratio/low_mean": 3.64547792059966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918047127626778e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15506.0, + "completions/mean_length": 7799.5234375, + "completions/mean_terminated_length": 7227.2255859375, + "completions/min_length": 908.0, + "completions/min_terminated_length": 908.0, + "entropy": 0.81409652531147, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031472526025027037, + "learning_rate": 1e-05, + "loss": 0.0106, + "num_tokens": 153684919.0, + "reward": 0.265625, + "reward_std": 0.2924865484237671, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999836802482605, + "sampling/importance_sampling_ratio/min": 0.0033896781969815493, + 
"sampling/sampling_logp_difference/max": 5.687020301818848, + "sampling/sampling_logp_difference/mean": 0.020041968673467636, + "step": 193 + }, + { + "clip_ratio/high_max": 9.558767487760633e-06, + "clip_ratio/high_mean": 2.3896918719401583e-06, + "clip_ratio/low_mean": 2.064374041310657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.303343228504673e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14882.0, + "completions/max_terminated_length": 14882.0, + "completions/mean_length": 6441.78125, + "completions/mean_terminated_length": 6441.78125, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 1.0110936611890793, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0008370456052944064, + "learning_rate": 1e-05, + "loss": 0.0398, + "num_tokens": 154527195.0, + "reward": 0.3984375, + "reward_std": 0.14677615463733673, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999023079872131, + "sampling/importance_sampling_ratio/min": 0.00020978205429855734, + "sampling/sampling_logp_difference/max": 8.469441413879395, + "sampling/sampling_logp_difference/mean": 0.021425459533929825, + "step": 194 + }, + { + "clip_ratio/high_max": 4.3503982851689216e-06, + "clip_ratio/high_mean": 1.0875995712922304e-06, + "clip_ratio/low_mean": 2.6103265497567918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7190865182546986e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15901.0, + "completions/mean_length": 7140.2890625, + "completions/mean_terminated_length": 6918.4404296875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.993028812110424, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.004406601656228304, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 155457592.0, + "reward": 0.296875, + "reward_std": 0.24882915616035461, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998899698257446, + "sampling/importance_sampling_ratio/min": 0.005102821160107851, + "sampling/sampling_logp_difference/max": 5.277961730957031, + "sampling/sampling_logp_difference/mean": 0.020247166976332664, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.063482140281849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.063482140281849e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15280.0, + "completions/max_terminated_length": 15280.0, + "completions/mean_length": 6220.5703125, + "completions/mean_terminated_length": 6220.5703125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.9336734637618065, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0013446965022012591, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 156277609.0, + "reward": 0.3671875, + "reward_std": 0.32089442014694214, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0036465052980929613, + "sampling/sampling_logp_difference/max": 5.613986015319824, + "sampling/sampling_logp_difference/mean": 0.018678557127714157, + "step": 196 + }, + { + "clip_ratio/high_max": 1.0170509995077737e-05, + "clip_ratio/high_mean": 2.542627498769434e-06, + "clip_ratio/low_mean": 2.2835527090592223e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.5378154816735332e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16143.0, + "completions/mean_length": 7230.3046875, + "completions/mean_terminated_length": 6935.02392578125, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.9315059334039688, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007178800296969712, + "learning_rate": 1e-05, + "loss": 0.0817, + "num_tokens": 157222744.0, + "reward": 0.4453125, + "reward_std": 0.17517909407615662, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.005948656238615513, + "sampling/sampling_logp_difference/max": 5.124589920043945, + "sampling/sampling_logp_difference/mean": 0.019229095429182053, + "step": 197 + }, + { + "clip_ratio/high_max": 8.961743105828646e-06, + "clip_ratio/high_mean": 2.2404357764571614e-06, + "clip_ratio/low_mean": 4.256807665115048e-05, + "clip_ratio/low_min": 4.9592349569138605e-06, + "clip_ratio/region_mean": 4.480851271182473e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15489.0, + "completions/mean_length": 7101.7890625, + "completions/mean_terminated_length": 6802.36279296875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.8410197496414185, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028408628422766924, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 158151901.0, + "reward": 0.3359375, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 1.1856438959512161e-06, + "sampling/sampling_logp_difference/max": 13.645224571228027, + "sampling/sampling_logp_difference/mean": 0.018435407429933548, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0979279042876442e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0979279042876442e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15693.0, + "completions/mean_length": 6822.109375, + "completions/mean_terminated_length": 6670.33349609375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.9384881108999252, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003448180854320526, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 159043939.0, + "reward": 0.390625, + "reward_std": 0.2906692624092102, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.0018930588848888874, + "sampling/sampling_logp_difference/max": 6.269561290740967, + "sampling/sampling_logp_difference/mean": 0.01985720731317997, + "step": 199 + }, + { + "clip_ratio/high_max": 1.87569592071668e-05, + "clip_ratio/high_mean": 5.608627873243677e-06, + "clip_ratio/low_mean": 2.393421118540573e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.954283939970992e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16130.0, + "completions/mean_length": 6969.671875, + "completions/mean_terminated_length": 6665.98388671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 
89.0, + "entropy": 0.8700083270668983, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002675072755664587, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 159955905.0, + "reward": 0.34375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 4.222963980282657e-06, + "sampling/sampling_logp_difference/max": 12.37497329711914, + "sampling/sampling_logp_difference/mean": 0.018493790179491043, + "step": 200 + }, + { + "clip_ratio/high_max": 1.0003448096540524e-05, + "clip_ratio/high_mean": 2.500862024135131e-06, + "clip_ratio/low_mean": 2.7816862200324977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0317724281303526e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16309.0, + "completions/mean_length": 6642.921875, + "completions/mean_terminated_length": 6409.13623046875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 1.0049321055412292, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034180639777332544, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 160825383.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999150037765503, + "sampling/importance_sampling_ratio/min": 0.000667327141854912, + "sampling/sampling_logp_difference/max": 7.312230110168457, + "sampling/sampling_logp_difference/mean": 0.020563330501317978, + "step": 201 + }, + { + "clip_ratio/high_max": 5.628348844766151e-06, + 
"clip_ratio/high_mean": 1.4070872111915378e-06, + "clip_ratio/low_mean": 3.0009771876393643e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1416859314958856e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15930.0, + "completions/mean_length": 6327.296875, + "completions/mean_terminated_length": 6085.9365234375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.8458633497357368, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016060187481343746, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 161653685.0, + "reward": 0.484375, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999157190322876, + "sampling/importance_sampling_ratio/min": 4.0065486246021464e-05, + "sampling/sampling_logp_difference/max": 10.124995231628418, + "sampling/sampling_logp_difference/mean": 0.018988098949193954, + "step": 202 + }, + { + "clip_ratio/high_max": 1.1031161648134002e-05, + "clip_ratio/high_mean": 2.7577904120335006e-06, + "clip_ratio/low_mean": 5.184456858842168e-05, + "clip_ratio/low_min": 3.209077931387583e-06, + "clip_ratio/region_mean": 5.460235854570783e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16008.0, + "completions/mean_length": 6871.4921875, + "completions/mean_terminated_length": 6643.1923828125, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "entropy": 0.8635450080037117, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027431908529251814, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 162555796.0, + "reward": 0.296875, + "reward_std": 
0.2906692326068878, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999676942825317, + "sampling/importance_sampling_ratio/min": 1.8959757653647102e-05, + "sampling/sampling_logp_difference/max": 10.873191833496094, + "sampling/sampling_logp_difference/mean": 0.019010700285434723, + "step": 203 + }, + { + "clip_ratio/high_max": 1.122018943533476e-05, + "clip_ratio/high_mean": 2.80504735883369e-06, + "clip_ratio/low_mean": 3.166110184338322e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4466149031686655e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15032.0, + "completions/mean_length": 5741.7734375, + "completions/mean_terminated_length": 5657.9765625, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.820662334561348, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021551409736275673, + "learning_rate": 1e-05, + "loss": 0.0325, + "num_tokens": 163312831.0, + "reward": 0.3828125, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999495148658752, + "sampling/importance_sampling_ratio/min": 0.00020485777349676937, + "sampling/sampling_logp_difference/max": 8.493194580078125, + "sampling/sampling_logp_difference/mean": 0.018189631402492523, + "step": 204 + }, + { + "clip_ratio/high_max": 5.249454261502251e-06, + "clip_ratio/high_mean": 2.6246168545185355e-06, + "clip_ratio/low_mean": 5.6316800055356e-05, + "clip_ratio/low_min": 6.944251708773663e-06, + "clip_ratio/region_mean": 5.894141622775351e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15114.0, + 
"completions/max_terminated_length": 15114.0, + "completions/mean_length": 6707.234375, + "completions/mean_terminated_length": 6707.234375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.9361380413174629, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021163856144994497, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 164189605.0, + "reward": 0.21875, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998763799667358, + "sampling/importance_sampling_ratio/min": 6.894206876495446e-07, + "sampling/sampling_logp_difference/max": 14.187414169311523, + "sampling/sampling_logp_difference/mean": 0.020120715722441673, + "step": 205 + }, + { + "clip_ratio/high_max": 1.2976960988453357e-05, + "clip_ratio/high_mean": 3.244240247113339e-06, + "clip_ratio/low_mean": 4.118970764466212e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.44339480054623e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15672.0, + "completions/mean_length": 7074.59375, + "completions/mean_terminated_length": 6774.2900390625, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "entropy": 0.9206110090017319, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003191466676071286, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 165114649.0, + "reward": 0.4296875, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999928891658783, + "sampling/importance_sampling_ratio/min": 0.0015704745892435312, + 
"sampling/sampling_logp_difference/max": 6.4563775062561035, + "sampling/sampling_logp_difference/mean": 0.020029421895742416, + "step": 206 + }, + { + "clip_ratio/high_max": 2.4998532580866595e-05, + "clip_ratio/high_mean": 6.947302438220504e-06, + "clip_ratio/low_mean": 4.305635661694396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.000365831620002e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15481.0, + "completions/mean_length": 6510.3984375, + "completions/mean_terminated_length": 6432.6533203125, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.9344880431890488, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002458518138155341, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 165971100.0, + "reward": 0.484375, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999246597290039, + "sampling/importance_sampling_ratio/min": 0.0011708823731169105, + "sampling/sampling_logp_difference/max": 6.749997615814209, + "sampling/sampling_logp_difference/mean": 0.02032654918730259, + "step": 207 + }, + { + "clip_ratio/high_max": 1.9761582279897993e-05, + "clip_ratio/high_mean": 4.940395569974498e-06, + "clip_ratio/low_mean": 2.598603293790802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.092642862156936e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 5363.4609375, + "completions/mean_terminated_length": 5363.4609375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8528282344341278, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 
0.3125, + "grad_norm": 0.0020360907074064016, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 166676943.0, + "reward": 0.46875, + "reward_std": 0.3079911470413208, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0005493607022799551, + "sampling/sampling_logp_difference/max": 7.506755352020264, + "sampling/sampling_logp_difference/mean": 0.01911250874400139, + "step": 208 + }, + { + "clip_ratio/high_max": 6.622867658734322e-06, + "clip_ratio/high_mean": 1.6557169146835804e-06, + "clip_ratio/low_mean": 4.006644434184636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.172216131337336e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14735.0, + "completions/mean_length": 4550.203125, + "completions/mean_terminated_length": 4266.1923828125, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.7535714656114578, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015881177969276905, + "learning_rate": 1e-05, + "loss": 0.0952, + "num_tokens": 167278489.0, + "reward": 0.5546875, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999875009059906, + "sampling/importance_sampling_ratio/min": 7.485204696422443e-05, + "sampling/sampling_logp_difference/max": 9.49999713897705, + "sampling/sampling_logp_difference/mean": 0.016919689252972603, + "step": 209 + }, + { + "clip_ratio/high_max": 2.8397119422152173e-05, + "clip_ratio/high_mean": 7.099279855538043e-06, + "clip_ratio/low_mean": 2.2654034410152235e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9753314493063954e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16087.0, + "completions/mean_length": 5080.078125, + "completions/mean_terminated_length": 4991.07080078125, + "completions/min_length": 684.0, + "completions/min_terminated_length": 684.0, + "entropy": 0.922355130314827, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021621519699692726, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 167949827.0, + "reward": 0.5546875, + "reward_std": 0.21829968690872192, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998918771743774, + "sampling/importance_sampling_ratio/min": 9.328075248049572e-05, + "sampling/sampling_logp_difference/max": 9.27989673614502, + "sampling/sampling_logp_difference/mean": 0.018358757719397545, + "step": 210 + }, + { + "clip_ratio/high_max": 1.3618362117995275e-05, + "clip_ratio/high_mean": 4.41220004177012e-06, + "clip_ratio/low_mean": 6.229132804946858e-05, + "clip_ratio/low_min": 1.1466368505352875e-05, + "clip_ratio/region_mean": 6.670352740911767e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15665.0, + "completions/max_terminated_length": 15665.0, + "completions/mean_length": 6371.9453125, + "completions/mean_terminated_length": 6371.9453125, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.8835635632276535, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003488079411908984, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 168781948.0, + "reward": 0.46875, + "reward_std": 0.4673760235309601, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999973773956299, + "sampling/importance_sampling_ratio/min": 4.154009047852014e-08, + "sampling/sampling_logp_difference/max": 16.996606826782227, + "sampling/sampling_logp_difference/mean": 0.01854466274380684, + "step": 211 + }, + { + "clip_ratio/high_max": 1.3789490822091466e-05, + "clip_ratio/high_mean": 3.4473727055228665e-06, + "clip_ratio/low_mean": 3.9819827861720114e-05, + "clip_ratio/low_min": 9.205373771692393e-06, + "clip_ratio/region_mean": 4.3267199771435116e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 7045.234375, + "completions/mean_terminated_length": 6665.609375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 0.8657141029834747, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002579214284196496, + "learning_rate": 1e-05, + "loss": 0.0787, + "num_tokens": 169704370.0, + "reward": 0.390625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999009370803833, + "sampling/importance_sampling_ratio/min": 0.00038033726741559803, + "sampling/sampling_logp_difference/max": 7.874452114105225, + "sampling/sampling_logp_difference/mean": 0.020650357007980347, + "step": 212 + }, + { + "clip_ratio/high_max": 1.0065672540804371e-05, + "clip_ratio/high_mean": 2.516418135201093e-06, + "clip_ratio/low_mean": 2.5041783715096244e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7558201850297337e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13301.0, + "completions/mean_length": 4835.1015625, + 
"completions/mean_terminated_length": 4744.16552734375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.8166600242257118, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015265591209754348, + "learning_rate": 1e-05, + "loss": 0.0399, + "num_tokens": 170343191.0, + "reward": 0.4765625, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999908983707428, + "sampling/importance_sampling_ratio/min": 0.0008047395385801792, + "sampling/sampling_logp_difference/max": 7.1249918937683105, + "sampling/sampling_logp_difference/mean": 0.01807256042957306, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.965024677654583e-05, + "clip_ratio/low_min": 3.7946631437080214e-06, + "clip_ratio/region_mean": 3.965024677654583e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 6042.6328125, + "completions/mean_terminated_length": 5622.251953125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.8976519927382469, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019487867830321193, + "learning_rate": 1e-05, + "loss": 0.1108, + "num_tokens": 171136048.0, + "reward": 0.3828125, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.0011446340940892696, + "sampling/sampling_logp_difference/max": 6.772670269012451, + 
"sampling/sampling_logp_difference/mean": 0.019680369645357132, + "step": 214 + }, + { + "clip_ratio/high_max": 5.620756382995751e-06, + "clip_ratio/high_mean": 1.4051890957489377e-06, + "clip_ratio/low_mean": 4.3911951024711016e-05, + "clip_ratio/low_min": 3.7100794543221127e-06, + "clip_ratio/region_mean": 4.531714012045995e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16298.0, + "completions/mean_length": 6418.3359375, + "completions/mean_terminated_length": 6339.8662109375, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "entropy": 0.8599612265825272, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018101281020790339, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 171976483.0, + "reward": 0.390625, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999486207962036, + "sampling/importance_sampling_ratio/min": 4.0352391806663945e-05, + "sampling/sampling_logp_difference/max": 10.117859840393066, + "sampling/sampling_logp_difference/mean": 0.01834172010421753, + "step": 215 + }, + { + "clip_ratio/high_max": 8.747987521928735e-06, + "clip_ratio/high_mean": 2.1869968804821838e-06, + "clip_ratio/low_mean": 1.736767285365204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9554669734134222e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15075.0, + "completions/mean_length": 5835.1484375, + "completions/mean_terminated_length": 5752.08642578125, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.930196188390255, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 
0.0009842904983088374, + "learning_rate": 1e-05, + "loss": 0.0174, + "num_tokens": 172743158.0, + "reward": 0.3515625, + "reward_std": 0.12863078713417053, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000227689743042, + "sampling/importance_sampling_ratio/min": 0.02929825149476528, + "sampling/sampling_logp_difference/max": 3.5302274227142334, + "sampling/sampling_logp_difference/mean": 0.020194582641124725, + "step": 216 + }, + { + "clip_ratio/high_max": 3.4560856420284836e-05, + "clip_ratio/high_mean": 1.2245807511135354e-05, + "clip_ratio/low_mean": 4.938034498991328e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.162615136418026e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15807.0, + "completions/mean_length": 4960.5234375, + "completions/mean_terminated_length": 4870.57470703125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.7726479545235634, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0032878813799470663, + "learning_rate": 1e-05, + "loss": -0.0492, + "num_tokens": 173400993.0, + "reward": 0.5, + "reward_std": 0.3924228549003601, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999999403953552, + "sampling/importance_sampling_ratio/min": 1.9806284399237484e-06, + "sampling/sampling_logp_difference/max": 13.132096290588379, + "sampling/sampling_logp_difference/mean": 0.018239401280879974, + "step": 217 + }, + { + "clip_ratio/high_max": 9.530344868835527e-06, + "clip_ratio/high_mean": 2.382586217208882e-06, + "clip_ratio/low_mean": 1.8789201192248584e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.1171787466300884e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15588.0, + "completions/max_terminated_length": 15588.0, + "completions/mean_length": 6778.453125, + "completions/mean_terminated_length": 6778.453125, + "completions/min_length": 709.0, + "completions/min_terminated_length": 709.0, + "entropy": 0.9891144260764122, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021506824996322393, + "learning_rate": 1e-05, + "loss": 0.0872, + "num_tokens": 174286163.0, + "reward": 0.3203125, + "reward_std": 0.23910348117351532, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002121925354, + "sampling/importance_sampling_ratio/min": 3.8179036891961005e-06, + "sampling/sampling_logp_difference/max": 12.475809097290039, + "sampling/sampling_logp_difference/mean": 0.019467821344733238, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.731942322498071e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.731942322498071e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16363.0, + "completions/mean_length": 7835.8203125, + "completions/mean_terminated_length": 7768.51171875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "entropy": 1.1394712179899216, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0019394620321691036, + "learning_rate": 1e-05, + "loss": 0.0144, + "num_tokens": 175314884.0, + "reward": 0.1171875, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999979138374329, + "sampling/importance_sampling_ratio/min": 0.0006493349210359156, + "sampling/sampling_logp_difference/max": 7.339561939239502, + "sampling/sampling_logp_difference/mean": 0.02314554899930954, + "step": 219 + }, + { + "clip_ratio/high_max": 2.6689051992434543e-05, + "clip_ratio/high_mean": 1.0311606502000359e-05, + "clip_ratio/low_mean": 4.749879690280068e-05, + "clip_ratio/low_min": 1.1613257356657414e-05, + "clip_ratio/region_mean": 5.781040522379044e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15977.0, + "completions/max_terminated_length": 15977.0, + "completions/mean_length": 6552.640625, + "completions/mean_terminated_length": 6552.640625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.9301942139863968, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029180990532040596, + "learning_rate": 1e-05, + "loss": 0.0895, + "num_tokens": 176170070.0, + "reward": 0.4921875, + "reward_std": 0.3527093529701233, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000029802322388, + "sampling/importance_sampling_ratio/min": 0.004631850868463516, + "sampling/sampling_logp_difference/max": 5.374798774719238, + "sampling/sampling_logp_difference/mean": 0.01968369632959366, + "step": 220 + }, + { + "clip_ratio/high_max": 6.5973504206340294e-06, + "clip_ratio/high_mean": 1.6493376051585074e-06, + "clip_ratio/low_mean": 3.3509465310999076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.515880302984442e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 6035.296875, + "completions/mean_terminated_length": 5953.81103515625, + "completions/min_length": 414.0, + 
"completions/min_terminated_length": 414.0, + "entropy": 0.9439655765891075, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013513187877833843, + "learning_rate": 1e-05, + "loss": 0.0062, + "num_tokens": 176962084.0, + "reward": 0.453125, + "reward_std": 0.23645779490470886, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 7.028038817225024e-05, + "sampling/sampling_logp_difference/max": 9.563017845153809, + "sampling/sampling_logp_difference/mean": 0.020156048238277435, + "step": 221 + }, + { + "clip_ratio/high_max": 4.21926688431995e-06, + "clip_ratio/high_mean": 1.0548167210799875e-06, + "clip_ratio/low_mean": 3.7025285053005064e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8080101546711376e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15470.0, + "completions/mean_length": 7192.4296875, + "completions/mean_terminated_length": 6895.92724609375, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 0.8545770645141602, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035121457185596228, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 177901579.0, + "reward": 0.328125, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998663663864136, + "sampling/importance_sampling_ratio/min": 0.000296071550110355, + "sampling/sampling_logp_difference/max": 8.124909400939941, + "sampling/sampling_logp_difference/mean": 0.018486706539988518, + "step": 222 + }, + { + "clip_ratio/high_max": 
3.974942046625074e-06, + "clip_ratio/high_mean": 9.937355116562685e-07, + "clip_ratio/low_mean": 3.2998319056787295e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.399205434106989e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 6525.328125, + "completions/mean_terminated_length": 6124.56884765625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.8625697493553162, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002456578193232417, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 178756773.0, + "reward": 0.3984375, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999394416809082, + "sampling/importance_sampling_ratio/min": 0.0001488614798290655, + "sampling/sampling_logp_difference/max": 8.812494277954102, + "sampling/sampling_logp_difference/mean": 0.018010437488555908, + "step": 223 + }, + { + "clip_ratio/high_max": 1.2826577403757256e-05, + "clip_ratio/high_mean": 4.401672981657612e-06, + "clip_ratio/low_mean": 7.05404337395521e-05, + "clip_ratio/low_min": 1.734040552037186e-05, + "clip_ratio/region_mean": 7.494210694858339e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14933.0, + "completions/mean_length": 7227.640625, + "completions/mean_terminated_length": 6932.27392578125, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.7740364670753479, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003040029900148511, + "learning_rate": 1e-05, + "loss": 0.1685, + "num_tokens": 179700639.0, + "reward": 0.515625, + 
"reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996599555015564, + "sampling/importance_sampling_ratio/min": 3.1452334496862022e-06, + "sampling/sampling_logp_difference/max": 12.669622421264648, + "sampling/sampling_logp_difference/mean": 0.018948577344417572, + "step": 224 + }, + { + "clip_ratio/high_max": 7.97244683781173e-06, + "clip_ratio/high_mean": 1.9931117094529327e-06, + "clip_ratio/low_mean": 2.7227763212067657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.922087492152059e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15639.0, + "completions/mean_length": 7019.4375, + "completions/mean_terminated_length": 6870.7939453125, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.9501559659838676, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001853659632615745, + "learning_rate": 1e-05, + "loss": 0.0498, + "num_tokens": 180615847.0, + "reward": 0.390625, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999617338180542, + "sampling/importance_sampling_ratio/min": 0.0061973449774086475, + "sampling/sampling_logp_difference/max": 5.083634376525879, + "sampling/sampling_logp_difference/mean": 0.021023310720920563, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.039616189606022e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.039616189606022e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
16384.0, + "completions/mean_length": 6705.03125, + "completions/mean_terminated_length": 6229.01611328125, + "completions/min_length": 1130.0, + "completions/min_terminated_length": 1130.0, + "entropy": 0.9054799973964691, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014863376272842288, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 181493971.0, + "reward": 0.3515625, + "reward_std": 0.2396402806043625, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.0023789836559444666, + "sampling/sampling_logp_difference/max": 6.04108190536499, + "sampling/sampling_logp_difference/mean": 0.019701875746250153, + "step": 226 + }, + { + "clip_ratio/high_max": 1.4479510582532384e-05, + "clip_ratio/high_mean": 3.619877645633096e-06, + "clip_ratio/low_mean": 2.6611398709519563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0231276070935564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5421.390625, + "completions/mean_terminated_length": 5421.390625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9483538940548897, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0039733098819851875, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 182208309.0, + "reward": 0.484375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999675154685974, + "sampling/importance_sampling_ratio/min": 0.011960627511143684, + "sampling/sampling_logp_difference/max": 
5.5837554931640625, + "sampling/sampling_logp_difference/mean": 0.01952577941119671, + "step": 227 + }, + { + "clip_ratio/high_max": 4.601678483595606e-06, + "clip_ratio/high_mean": 1.1504196208989015e-06, + "clip_ratio/low_mean": 4.089345225111174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2043871189889614e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6497.28125, + "completions/mean_terminated_length": 6340.349609375, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.8902791813015938, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015076796989887953, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 183058249.0, + "reward": 0.4453125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000579357147217, + "sampling/importance_sampling_ratio/min": 0.011128061451017857, + "sampling/sampling_logp_difference/max": 4.498285293579102, + "sampling/sampling_logp_difference/mean": 0.019255032762885094, + "step": 228 + }, + { + "clip_ratio/high_max": 5.255413270788267e-06, + "clip_ratio/high_mean": 1.3138533176970668e-06, + "clip_ratio/low_mean": 3.985653711424675e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1170390431943815e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14710.0, + "completions/max_terminated_length": 14710.0, + "completions/mean_length": 4411.4453125, + "completions/mean_terminated_length": 4411.4453125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 1.104304239153862, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002237006789073348, + 
"learning_rate": 1e-05, + "loss": 0.1124, + "num_tokens": 183645026.0, + "reward": 0.3203125, + "reward_std": 0.22461041808128357, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000056028366089, + "sampling/importance_sampling_ratio/min": 4.804155082638317e-07, + "sampling/sampling_logp_difference/max": 14.548614501953125, + "sampling/sampling_logp_difference/mean": 0.020417846739292145, + "step": 229 + }, + { + "clip_ratio/high_max": 4.956973498337902e-06, + "clip_ratio/high_mean": 1.2392433745844755e-06, + "clip_ratio/low_mean": 4.839278165036376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9632024911261396e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15486.0, + "completions/mean_length": 5763.3828125, + "completions/mean_terminated_length": 5508.48828125, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 0.7673545032739639, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0027243588119745255, + "learning_rate": 1e-05, + "loss": 0.0747, + "num_tokens": 184402387.0, + "reward": 0.4375, + "reward_std": 0.3661494255065918, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999270439147949, + "sampling/importance_sampling_ratio/min": 0.0008851620368659496, + "sampling/sampling_logp_difference/max": 7.029739856719971, + "sampling/sampling_logp_difference/mean": 0.01735807955265045, + "step": 230 + }, + { + "clip_ratio/high_max": 1.412869187333854e-05, + "clip_ratio/high_mean": 3.532172968334635e-06, + "clip_ratio/low_mean": 4.364474455087475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.717691729183571e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15837.0, + "completions/mean_length": 6143.3125, + "completions/mean_terminated_length": 5980.76220703125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 0.9383679181337357, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016755202086642385, + "learning_rate": 1e-05, + "loss": 0.1134, + "num_tokens": 185207315.0, + "reward": 0.40625, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.00010746628686320037, + "sampling/sampling_logp_difference/max": 9.138333320617676, + "sampling/sampling_logp_difference/mean": 0.01892942003905773, + "step": 231 + }, + { + "clip_ratio/high_max": 5.389092621044256e-06, + "clip_ratio/high_mean": 1.347273155261064e-06, + "clip_ratio/low_mean": 4.616663244405572e-05, + "clip_ratio/low_min": 5.818554200232029e-06, + "clip_ratio/region_mean": 4.7513905599316786e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16101.0, + "completions/mean_length": 6852.234375, + "completions/mean_terminated_length": 6623.47216796875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.9856249913573265, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036351638846099377, + "learning_rate": 1e-05, + "loss": 0.0413, + "num_tokens": 186104113.0, + "reward": 0.375, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 0.0006267272983677685, + "sampling/sampling_logp_difference/max": 7.374999046325684, + "sampling/sampling_logp_difference/mean": 0.021776381880044937, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.837307613390294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.837307613390294e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 6634.1484375, + "completions/mean_terminated_length": 6479.38916015625, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 1.0182439163327217, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003553485032171011, + "learning_rate": 1e-05, + "loss": 0.0886, + "num_tokens": 186973796.0, + "reward": 0.34375, + "reward_std": 0.24381662905216217, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.00038018118357285857, + "sampling/sampling_logp_difference/max": 7.8748626708984375, + "sampling/sampling_logp_difference/mean": 0.02058180794119835, + "step": 233 + }, + { + "clip_ratio/high_max": 1.4436222500080476e-05, + "clip_ratio/high_mean": 3.609055625020119e-06, + "clip_ratio/low_mean": 5.134189859745675e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.495095410879003e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14911.0, + "completions/mean_length": 6424.2421875, + "completions/mean_terminated_length": 6266.1513671875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 
115.0, + "entropy": 0.9030232205986977, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002669632900506258, + "learning_rate": 1e-05, + "loss": 0.0828, + "num_tokens": 187820443.0, + "reward": 0.34375, + "reward_std": 0.2817176878452301, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999942183494568, + "sampling/importance_sampling_ratio/min": 0.004488746635615826, + "sampling/sampling_logp_difference/max": 5.406181812286377, + "sampling/sampling_logp_difference/mean": 0.01908625289797783, + "step": 234 + }, + { + "clip_ratio/high_max": 1.4932538306311471e-05, + "clip_ratio/high_mean": 3.733134576577868e-06, + "clip_ratio/low_mean": 2.516909023597691e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8902224585181102e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14473.0, + "completions/mean_length": 6582.21875, + "completions/mean_terminated_length": 6505.03955078125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "entropy": 0.9906348586082458, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021964670158922672, + "learning_rate": 1e-05, + "loss": 0.0122, + "num_tokens": 188682111.0, + "reward": 0.2734375, + "reward_std": 0.22908620536327362, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.01623692736029625, + "sampling/sampling_logp_difference/max": 4.9629387855529785, + "sampling/sampling_logp_difference/mean": 0.020555656403303146, + "step": 235 + }, + { + "clip_ratio/high_max": 1.3005691471335012e-05, + 
"clip_ratio/high_mean": 3.251422867833753e-06, + "clip_ratio/low_mean": 4.822792686809407e-05, + "clip_ratio/low_min": 4.575235379888909e-06, + "clip_ratio/region_mean": 5.147934950855415e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16333.0, + "completions/mean_length": 6687.8359375, + "completions/mean_terminated_length": 6611.48828125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.9669140502810478, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0032587468158453703, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 189556570.0, + "reward": 0.375, + "reward_std": 0.36956924200057983, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.002121176104992628, + "sampling/sampling_logp_difference/max": 6.155784606933594, + "sampling/sampling_logp_difference/mean": 0.020776130259037018, + "step": 236 + }, + { + "clip_ratio/high_max": 2.541685034884722e-05, + "clip_ratio/high_mean": 6.354212587211805e-06, + "clip_ratio/low_mean": 4.488310526085115e-05, + "clip_ratio/low_min": 4.259959951014025e-06, + "clip_ratio/region_mean": 5.123731762068928e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14579.0, + "completions/mean_length": 5933.890625, + "completions/mean_terminated_length": 5851.6064453125, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 0.777520164847374, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023373132571578026, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 190333676.0, + "reward": 0.5390625, + "reward_std": 
0.3577219247817993, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999416470527649, + "sampling/importance_sampling_ratio/min": 1.3007656889385544e-05, + "sampling/sampling_logp_difference/max": 11.249972343444824, + "sampling/sampling_logp_difference/mean": 0.017036860808730125, + "step": 237 + }, + { + "clip_ratio/high_max": 9.352454981126357e-06, + "clip_ratio/high_mean": 2.3381137452815892e-06, + "clip_ratio/low_mean": 3.286883497821691e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5206948496124824e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16254.0, + "completions/mean_length": 6691.53125, + "completions/mean_terminated_length": 6537.68310546875, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "entropy": 1.0021202191710472, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033220481127500534, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 191208240.0, + "reward": 0.2265625, + "reward_std": 0.23987272381782532, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999876618385315, + "sampling/importance_sampling_ratio/min": 0.006665683817118406, + "sampling/sampling_logp_difference/max": 5.010782718658447, + "sampling/sampling_logp_difference/mean": 0.02151130512356758, + "step": 238 + }, + { + "clip_ratio/high_max": 2.0475443307077512e-05, + "clip_ratio/high_mean": 5.118860826769378e-06, + "clip_ratio/low_mean": 4.199072691335459e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7109587512750295e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15653.0, + 
"completions/max_terminated_length": 15653.0, + "completions/mean_length": 5480.5078125, + "completions/mean_terminated_length": 5480.5078125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.774504691362381, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002824194496497512, + "learning_rate": 1e-05, + "loss": 0.0472, + "num_tokens": 191927753.0, + "reward": 0.5078125, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999160766601562, + "sampling/importance_sampling_ratio/min": 2.561557721492136e-06, + "sampling/sampling_logp_difference/max": 12.874895095825195, + "sampling/sampling_logp_difference/mean": 0.01758616417646408, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.71521939541708e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.71521939541708e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16232.0, + "completions/mean_length": 6245.171875, + "completions/mean_terminated_length": 6001.84033203125, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "entropy": 0.9671605005860329, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020431289449334145, + "learning_rate": 1e-05, + "loss": 0.0527, + "num_tokens": 192746327.0, + "reward": 0.3359375, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999756813049316, + "sampling/importance_sampling_ratio/min": 7.518127677030861e-05, + 
"sampling/sampling_logp_difference/max": 9.49560832977295, + "sampling/sampling_logp_difference/mean": 0.02066320925951004, + "step": 240 + }, + { + "clip_ratio/high_max": 1.1142639777972363e-05, + "clip_ratio/high_mean": 2.7856599444930907e-06, + "clip_ratio/low_mean": 4.276063509678352e-05, + "clip_ratio/low_min": 3.055412889807485e-06, + "clip_ratio/region_mean": 4.554629526865028e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16371.0, + "completions/max_terminated_length": 15709.0, + "completions/mean_length": 6828.8515625, + "completions/mean_terminated_length": 6677.38916015625, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 0.9914879351854324, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019144542748108506, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193643468.0, + "reward": 0.34375, + "reward_std": 0.3264309763908386, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000360012054443, + "sampling/importance_sampling_ratio/min": 0.0003172139695379883, + "sampling/sampling_logp_difference/max": 8.055933952331543, + "sampling/sampling_logp_difference/mean": 0.020327996462583542, + "step": 241 + }, + { + "clip_ratio/high_max": 1.3134391338098794e-05, + "clip_ratio/high_mean": 3.2835978345246986e-06, + "clip_ratio/low_mean": 5.683154779489996e-05, + "clip_ratio/low_min": 4.3356108108127955e-06, + "clip_ratio/region_mean": 6.011514608417201e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16289.0, + "completions/mean_length": 6280.125, + "completions/mean_terminated_length": 5954.193359375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.8634965419769287, + "epoch": 
0.22263109475620976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022551591973751783, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 194465324.0, + "reward": 0.46875, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999502897262573, + "sampling/importance_sampling_ratio/min": 0.003390352241694927, + "sampling/sampling_logp_difference/max": 5.686821460723877, + "sampling/sampling_logp_difference/mean": 0.019659511744976044, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.619306153268553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619306153268553e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15646.0, + "completions/mean_length": 6910.5625, + "completions/mean_terminated_length": 6525.46337890625, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.9886282533407211, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012924466282129288, + "learning_rate": 1e-05, + "loss": 0.0753, + "num_tokens": 195369580.0, + "reward": 0.3984375, + "reward_std": 0.2590838074684143, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000083327293396, + "sampling/importance_sampling_ratio/min": 1.0787954124680255e-05, + "sampling/sampling_logp_difference/max": 11.437080383300781, + "sampling/sampling_logp_difference/mean": 0.020975295454263687, + "step": 243 + }, + { + "clip_ratio/high_max": 1.377244143441203e-05, + "clip_ratio/high_mean": 3.4431103586030076e-06, + "clip_ratio/low_mean": 2.4107489650759817e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7550600123049662e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12768.0, + "completions/mean_length": 5647.53125, + "completions/mean_terminated_length": 5562.9921875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.8360519111156464, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019667574670165777, + "learning_rate": 1e-05, + "loss": 0.0333, + "num_tokens": 196110328.0, + "reward": 0.4921875, + "reward_std": 0.33508312702178955, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999792575836182, + "sampling/importance_sampling_ratio/min": 0.00731487525627017, + "sampling/sampling_logp_difference/max": 4.917845249176025, + "sampling/sampling_logp_difference/mean": 0.017768483608961105, + "step": 244 + }, + { + "clip_ratio/high_max": 1.114784731726104e-05, + "clip_ratio/high_mean": 2.78696182931526e-06, + "clip_ratio/low_mean": 2.6054579166157055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8841540995472315e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15671.0, + "completions/mean_length": 6249.6171875, + "completions/mean_terminated_length": 6088.75439453125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.837661437690258, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017836211482062936, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 196926255.0, + "reward": 0.4453125, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443888664246, + "sampling/importance_sampling_ratio/min": 8.313281432492658e-05, + "sampling/sampling_logp_difference/max": 9.395071029663086, + "sampling/sampling_logp_difference/mean": 0.018142729997634888, + "step": 245 + }, + { + "clip_ratio/high_max": 3.1028919238451635e-06, + "clip_ratio/high_mean": 7.757229809612909e-07, + "clip_ratio/low_mean": 5.6368714012933196e-05, + "clip_ratio/low_min": 5.583348411164479e-06, + "clip_ratio/region_mean": 5.7144436595990555e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14663.0, + "completions/mean_length": 5561.796875, + "completions/mean_terminated_length": 5476.58251953125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 1.0337117239832878, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032067650463432074, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 197657021.0, + "reward": 0.421875, + "reward_std": 0.3603675961494446, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000261068344116, + "sampling/importance_sampling_ratio/min": 0.0026236141566187143, + "sampling/sampling_logp_difference/max": 5.943202495574951, + "sampling/sampling_logp_difference/mean": 0.02046290785074234, + "step": 246 + }, + { + "clip_ratio/high_max": 2.244927713945799e-05, + "clip_ratio/high_mean": 5.612319284864498e-06, + "clip_ratio/low_mean": 3.963059293710103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5242911710374756e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14806.0, + "completions/mean_length": 7230.09375, + 
"completions/mean_terminated_length": 7010.400390625, + "completions/min_length": 858.0, + "completions/min_terminated_length": 858.0, + "entropy": 0.9666887000203133, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002695069881156087, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 198604673.0, + "reward": 0.390625, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999954104423523, + "sampling/importance_sampling_ratio/min": 0.004087009001523256, + "sampling/sampling_logp_difference/max": 5.499941825866699, + "sampling/sampling_logp_difference/mean": 0.021222755312919617, + "step": 247 + }, + { + "clip_ratio/high_max": 6.0509246395668015e-06, + "clip_ratio/high_mean": 3.018646339114639e-06, + "clip_ratio/low_mean": 4.125545319766388e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4274099309404846e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14964.0, + "completions/mean_length": 7186.09375, + "completions/mean_terminated_length": 7040.095703125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.9754119142889977, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014327351236715913, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 199545181.0, + "reward": 0.328125, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 3.340628245496191e-05, + "sampling/sampling_logp_difference/max": 10.306766510009766, + 
"sampling/sampling_logp_difference/mean": 0.02061491459608078, + "step": 248 + }, + { + "clip_ratio/high_max": 1.3521318351195077e-05, + "clip_ratio/high_mean": 3.3803295877987694e-06, + "clip_ratio/low_mean": 4.744600971662294e-05, + "clip_ratio/low_min": 4.111165708309272e-06, + "clip_ratio/region_mean": 5.08263395886388e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15812.0, + "completions/mean_length": 7464.1328125, + "completions/mean_terminated_length": 7322.5478515625, + "completions/min_length": 994.0, + "completions/min_terminated_length": 994.0, + "entropy": 1.0257701128721237, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017415130278095603, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 200521262.0, + "reward": 0.296875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000050067901611, + "sampling/importance_sampling_ratio/min": 0.004382971208542585, + "sampling/sampling_logp_difference/max": 5.430028438568115, + "sampling/sampling_logp_difference/mean": 0.02146603912115097, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6656134000168095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6656134000168095e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7929.0390625, + "completions/mean_terminated_length": 6973.2607421875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.8728866130113602, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018543615005910397, + "learning_rate": 1e-05, + 
"loss": 0.0556, + "num_tokens": 201553491.0, + "reward": 0.25, + "reward_std": 0.3237725794315338, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999157786369324, + "sampling/importance_sampling_ratio/min": 0.0002044498542090878, + "sampling/sampling_logp_difference/max": 8.495187759399414, + "sampling/sampling_logp_difference/mean": 0.01925993338227272, + "step": 250 + }, + { + "clip_ratio/high_max": 1.5812252968316898e-05, + "clip_ratio/high_mean": 3.9530632420792244e-06, + "clip_ratio/low_mean": 4.320342043229175e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.715648356068414e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6577.84375, + "completions/mean_terminated_length": 6261.51611328125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.759723886847496, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001268691150471568, + "learning_rate": 1e-05, + "loss": 0.117, + "num_tokens": 202411655.0, + "reward": 0.515625, + "reward_std": 0.34822866320610046, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999426603317261, + "sampling/importance_sampling_ratio/min": 0.0004213420324958861, + "sampling/sampling_logp_difference/max": 7.77206563949585, + "sampling/sampling_logp_difference/mean": 0.018232906237244606, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.175654944698181e-05, + "clip_ratio/low_min": 8.377270660275826e-06, + "clip_ratio/region_mean": 3.175654944698181e-05, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 16117.0, + "completions/max_terminated_length": 16117.0, + "completions/mean_length": 6513.65625, + "completions/mean_terminated_length": 6513.65625, + "completions/min_length": 858.0, + "completions/min_terminated_length": 858.0, + "entropy": 1.0247815549373627, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004479583352804184, + "learning_rate": 1e-05, + "loss": -0.0114, + "num_tokens": 203265811.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999909400939941, + "sampling/importance_sampling_ratio/min": 0.011329792439937592, + "sampling/sampling_logp_difference/max": 4.480319499969482, + "sampling/sampling_logp_difference/mean": 0.02229863964021206, + "step": 252 + }, + { + "clip_ratio/high_max": 5.371261522668647e-06, + "clip_ratio/high_mean": 1.3428153806671617e-06, + "clip_ratio/low_mean": 4.290480364943505e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4247618916415377e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16035.0, + "completions/max_terminated_length": 16035.0, + "completions/mean_length": 6013.6171875, + "completions/mean_terminated_length": 6013.6171875, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "entropy": 0.8476304411888123, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017210334772244096, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 204054186.0, + "reward": 0.5078125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998961687088013, + 
"sampling/importance_sampling_ratio/min": 3.32363242705469e-06, + "sampling/sampling_logp_difference/max": 12.614452362060547, + "sampling/sampling_logp_difference/mean": 0.018720701336860657, + "step": 253 + }, + { + "clip_ratio/high_max": 1.4894108517182758e-05, + "clip_ratio/high_mean": 3.7235271292956895e-06, + "clip_ratio/low_mean": 3.136672694381559e-05, + "clip_ratio/low_min": 3.941974227927858e-06, + "clip_ratio/region_mean": 3.509025418679812e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14989.0, + "completions/max_terminated_length": 14989.0, + "completions/mean_length": 7090.2109375, + "completions/mean_terminated_length": 7090.2109375, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.9804464280605316, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003268485888838768, + "learning_rate": 1e-05, + "loss": 0.0441, + "num_tokens": 204982085.0, + "reward": 0.3828125, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999740719795227, + "sampling/importance_sampling_ratio/min": 6.605670205317438e-05, + "sampling/sampling_logp_difference/max": 9.62499713897705, + "sampling/sampling_logp_difference/mean": 0.021524619311094284, + "step": 254 + }, + { + "clip_ratio/high_max": 1.3869113445252879e-05, + "clip_ratio/high_mean": 3.4672783613132196e-06, + "clip_ratio/low_mean": 3.1164222662027896e-05, + "clip_ratio/low_min": 2.928154799519689e-06, + "clip_ratio/region_mean": 3.46315009664977e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 6272.65625, + "completions/mean_terminated_length": 6112.1591796875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 
65.0, + "entropy": 0.8322838544845581, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002127156127244234, + "learning_rate": 1e-05, + "loss": 0.0142, + "num_tokens": 205805529.0, + "reward": 0.4296875, + "reward_std": 0.3385029733181, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999648928642273, + "sampling/importance_sampling_ratio/min": 0.00019322636944707483, + "sampling/sampling_logp_difference/max": 8.551648139953613, + "sampling/sampling_logp_difference/mean": 0.018514126539230347, + "step": 255 + }, + { + "clip_ratio/high_max": 7.213966455310583e-06, + "clip_ratio/high_mean": 4.349803020886611e-06, + "clip_ratio/low_mean": 3.907777556833025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3427579043964215e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6422.7109375, + "completions/mean_terminated_length": 5846.43798828125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.8222996592521667, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001939435489475727, + "learning_rate": 1e-05, + "loss": 0.1001, + "num_tokens": 206647908.0, + "reward": 0.4609375, + "reward_std": 0.26143795251846313, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 6.205694808159024e-05, + "sampling/sampling_logp_difference/max": 9.687458038330078, + "sampling/sampling_logp_difference/mean": 0.018810249865055084, + "step": 256 + }, + { + "clip_ratio/high_max": 2.1247945142022218e-05, + 
"clip_ratio/high_mean": 6.189401005940454e-06, + "clip_ratio/low_mean": 4.7238423121598316e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342782378647826e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15664.0, + "completions/mean_length": 6179.8046875, + "completions/mean_terminated_length": 6099.45654296875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.031787522137165, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002615252509713173, + "learning_rate": 1e-05, + "loss": 0.0147, + "num_tokens": 207459043.0, + "reward": 0.5, + "reward_std": 0.3232533931732178, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 1.9359204088686965e-05, + "sampling/sampling_logp_difference/max": 10.85234260559082, + "sampling/sampling_logp_difference/mean": 0.020463883876800537, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5109407349409594e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5109407349409594e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16052.0, + "completions/mean_length": 7093.5390625, + "completions/mean_terminated_length": 6474.17529296875, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8378612920641899, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002656357828527689, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208389800.0, + "reward": 0.3828125, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.3828125, + 
"rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998178482055664, + "sampling/importance_sampling_ratio/min": 2.1559546439675614e-05, + "sampling/sampling_logp_difference/max": 10.744691848754883, + "sampling/sampling_logp_difference/mean": 0.01860899105668068, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7354818396597693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7354818396597693e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 7782.46875, + "completions/mean_terminated_length": 7576.34423828125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 1.0068294331431389, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026847824919968843, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 209407212.0, + "reward": 0.3203125, + "reward_std": 0.2188364565372467, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 2.5824127078521997e-05, + "sampling/sampling_logp_difference/max": 10.564201354980469, + "sampling/sampling_logp_difference/mean": 0.021435359492897987, + "step": 259 + }, + { + "clip_ratio/high_max": 1.5335908301494783e-05, + "clip_ratio/high_mean": 3.833977075373696e-06, + "clip_ratio/low_mean": 3.303791140751855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6871888482892246e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 6713.3359375, + 
"completions/mean_terminated_length": 6637.18896484375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.8899351507425308, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019718443509191275, + "learning_rate": 1e-05, + "loss": 0.0167, + "num_tokens": 210286983.0, + "reward": 0.4140625, + "reward_std": 0.29719969630241394, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000264644622803, + "sampling/importance_sampling_ratio/min": 8.772138971835375e-05, + "sampling/sampling_logp_difference/max": 9.341344833374023, + "sampling/sampling_logp_difference/mean": 0.019354315474629402, + "step": 260 + }, + { + "clip_ratio/high_max": 2.0819897144974675e-05, + "clip_ratio/high_mean": 5.204974286243669e-06, + "clip_ratio/low_mean": 3.656347121250292e-05, + "clip_ratio/low_min": 5.0166554501629435e-06, + "clip_ratio/region_mean": 4.176844549874659e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14552.0, + "completions/mean_length": 6275.5390625, + "completions/mean_terminated_length": 6115.087890625, + "completions/min_length": 663.0, + "completions/min_terminated_length": 663.0, + "entropy": 0.901648998260498, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.0029727297369390726, + "learning_rate": 1e-05, + "loss": 0.0593, + "num_tokens": 211107380.0, + "reward": 0.40625, + "reward_std": 0.4373784065246582, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999792575836182, + "sampling/importance_sampling_ratio/min": 0.00043164435192011297, + "sampling/sampling_logp_difference/max": 7.747908592224121, + 
"sampling/sampling_logp_difference/mean": 0.019338306039571762, + "step": 261 + }, + { + "clip_ratio/high_max": 4.363734251455753e-05, + "clip_ratio/high_mean": 1.2403264463500818e-05, + "clip_ratio/low_mean": 4.217202859990721e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4575292381287e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 5959.7578125, + "completions/mean_terminated_length": 5877.67724609375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8542912155389786, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028311724308878183, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 211890237.0, + "reward": 0.515625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 0.0007836154545657337, + "sampling/sampling_logp_difference/max": 7.151592254638672, + "sampling/sampling_logp_difference/mean": 0.018685901537537575, + "step": 262 + }, + { + "clip_ratio/high_max": 1.514913219580194e-05, + "clip_ratio/high_mean": 3.787283048950485e-06, + "clip_ratio/low_mean": 3.2207174626819324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5994458357890835e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16008.0, + "completions/mean_length": 6882.1875, + "completions/mean_terminated_length": 6575.67724609375, + "completions/min_length": 1170.0, + "completions/min_terminated_length": 1170.0, + "entropy": 0.9642625227570534, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002143653342500329, + 
"learning_rate": 1e-05, + "loss": 0.0127, + "num_tokens": 212792813.0, + "reward": 0.359375, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999080896377563, + "sampling/importance_sampling_ratio/min": 0.0034667642321437597, + "sampling/sampling_logp_difference/max": 5.664533615112305, + "sampling/sampling_logp_difference/mean": 0.020183943212032318, + "step": 263 + }, + { + "clip_ratio/high_max": 1.7900180637298035e-05, + "clip_ratio/high_mean": 4.475045159324509e-06, + "clip_ratio/low_mean": 3.741970294868224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1894748392223846e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6941.8828125, + "completions/mean_terminated_length": 6715.2724609375, + "completions/min_length": 978.0, + "completions/min_terminated_length": 978.0, + "entropy": 0.9488044381141663, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014945612056180835, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 213703638.0, + "reward": 0.3984375, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999405145645142, + "sampling/importance_sampling_ratio/min": 0.0005360813229344785, + "sampling/sampling_logp_difference/max": 7.531224727630615, + "sampling/sampling_logp_difference/mean": 0.02019106224179268, + "step": 264 + }, + { + "clip_ratio/high_max": 4.028359853691654e-06, + "clip_ratio/high_mean": 1.0070899634229136e-06, + "clip_ratio/low_mean": 4.494676113608875e-05, + "clip_ratio/low_min": 3.771535375562962e-06, + 
"clip_ratio/region_mean": 4.595385098582483e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14403.0, + "completions/mean_length": 6453.2109375, + "completions/mean_terminated_length": 6295.57958984375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.9140987247228622, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001788914087228477, + "learning_rate": 1e-05, + "loss": 0.0573, + "num_tokens": 214551065.0, + "reward": 0.3984375, + "reward_std": 0.34245961904525757, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999093413352966, + "sampling/importance_sampling_ratio/min": 6.614608719246462e-05, + "sampling/sampling_logp_difference/max": 9.623644828796387, + "sampling/sampling_logp_difference/mean": 0.01938386633992195, + "step": 265 + }, + { + "clip_ratio/high_max": 1.3890341051592259e-05, + "clip_ratio/high_mean": 3.4725852628980647e-06, + "clip_ratio/low_mean": 2.701378042502256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0486365801607462e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16074.0, + "completions/mean_length": 7625.375, + "completions/mean_terminated_length": 7556.4091796875, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.9313022494316101, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023314026184380054, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 215546625.0, + "reward": 0.3515625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 3.250058568937675e-07, + "sampling/sampling_logp_difference/max": 14.939422607421875, + "sampling/sampling_logp_difference/mean": 0.020401259884238243, + "step": 266 + }, + { + "clip_ratio/high_max": 2.9235679903649725e-05, + "clip_ratio/high_mean": 7.308919975912431e-06, + "clip_ratio/low_mean": 2.5110286742346943e-05, + "clip_ratio/low_min": 3.1065162602317287e-06, + "clip_ratio/region_mean": 3.24192064908857e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16084.0, + "completions/mean_length": 6315.3046875, + "completions/mean_terminated_length": 6155.484375, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "entropy": 0.8942855522036552, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003379981964826584, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 216377176.0, + "reward": 0.421875, + "reward_std": 0.31587696075439453, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999739527702332, + "sampling/importance_sampling_ratio/min": 0.008766444399952888, + "sampling/sampling_logp_difference/max": 4.736824035644531, + "sampling/sampling_logp_difference/mean": 0.01958339475095272, + "step": 267 + }, + { + "clip_ratio/high_max": 1.070113876266987e-05, + "clip_ratio/high_mean": 2.6752846906674677e-06, + "clip_ratio/low_mean": 3.970586050172642e-05, + "clip_ratio/low_min": 5.915619567531394e-06, + "clip_ratio/region_mean": 4.238114468080312e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15699.0, + "completions/mean_length": 7196.7109375, + "completions/mean_terminated_length": 6823.24365234375, + 
"completions/min_length": 741.0, + "completions/min_terminated_length": 741.0, + "entropy": 1.0663049817085266, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025235258508473635, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 217316755.0, + "reward": 0.3359375, + "reward_std": 0.2893138825893402, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999923586845398, + "sampling/importance_sampling_ratio/min": 0.0007813565316610038, + "sampling/sampling_logp_difference/max": 7.154479026794434, + "sampling/sampling_logp_difference/mean": 0.02093672752380371, + "step": 268 + }, + { + "clip_ratio/high_max": 3.7446132409968413e-05, + "clip_ratio/high_mean": 1.0083826055051759e-05, + "clip_ratio/low_mean": 5.169025735085597e-05, + "clip_ratio/low_min": 5.641812549583847e-06, + "clip_ratio/region_mean": 6.177408295116038e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16286.0, + "completions/max_terminated_length": 16286.0, + "completions/mean_length": 6770.59375, + "completions/mean_terminated_length": 6770.59375, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 1.0205552130937576, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038966729771345854, + "learning_rate": 1e-05, + "loss": 0.0849, + "num_tokens": 218203975.0, + "reward": 0.4140625, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994924068450928, + "sampling/importance_sampling_ratio/min": 2.5875104370243207e-07, + "sampling/sampling_logp_difference/max": 15.167399406433105, + "sampling/sampling_logp_difference/mean": 0.025428105145692825, + 
"step": 269 + }, + { + "clip_ratio/high_max": 3.3825838272605324e-06, + "clip_ratio/high_mean": 8.456459568151331e-07, + "clip_ratio/low_mean": 2.8302461942075752e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9148108296794817e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15749.0, + "completions/mean_length": 7115.6953125, + "completions/mean_terminated_length": 6968.57958984375, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "entropy": 1.0728939920663834, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025828159414231777, + "learning_rate": 1e-05, + "loss": 0.0422, + "num_tokens": 219134568.0, + "reward": 0.2890625, + "reward_std": 0.21990221738815308, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999753832817078, + "sampling/importance_sampling_ratio/min": 0.0019932277500629425, + "sampling/sampling_logp_difference/max": 6.2179999351501465, + "sampling/sampling_logp_difference/mean": 0.02109808847308159, + "step": 270 + }, + { + "clip_ratio/high_max": 8.590399147578864e-06, + "clip_ratio/high_mean": 2.147599786894716e-06, + "clip_ratio/low_mean": 4.2856369077526324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5003969148638134e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15560.0, + "completions/mean_length": 6160.125, + "completions/mean_terminated_length": 5914.75244140625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.8673425689339638, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002692030044272542, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 219943376.0, + 
"reward": 0.4375, + "reward_std": 0.34717273712158203, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998801350593567, + "sampling/importance_sampling_ratio/min": 0.0021331151947379112, + "sampling/sampling_logp_difference/max": 6.150171756744385, + "sampling/sampling_logp_difference/mean": 0.01947931945323944, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4606903429667e-05, + "clip_ratio/low_min": 4.498344424064271e-06, + "clip_ratio/region_mean": 4.4606903429667e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14763.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5778.0234375, + "completions/mean_terminated_length": 5778.0234375, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 1.1366781443357468, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002457446651533246, + "learning_rate": 1e-05, + "loss": 0.0399, + "num_tokens": 220702603.0, + "reward": 0.3828125, + "reward_std": 0.3400956988334656, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996986985206604, + "sampling/importance_sampling_ratio/min": 1.4515491386646318e-07, + "sampling/sampling_logp_difference/max": 15.745464324951172, + "sampling/sampling_logp_difference/mean": 0.021183129400014877, + "step": 272 + }, + { + "clip_ratio/high_max": 6.248437784961425e-06, + "clip_ratio/high_mean": 2.4186024347727653e-06, + "clip_ratio/low_mean": 1.783873301519634e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.025733522259543e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16265.0, + "completions/mean_length": 7509.078125, + "completions/mean_terminated_length": 7296.08056640625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 1.071702554821968, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002503670286387205, + "learning_rate": 1e-05, + "loss": -0.0088, + "num_tokens": 221683925.0, + "reward": 0.2734375, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.00013993355969432741, + "sampling/sampling_logp_difference/max": 8.874342918395996, + "sampling/sampling_logp_difference/mean": 0.021589912474155426, + "step": 273 + }, + { + "clip_ratio/high_max": 2.347871304664295e-05, + "clip_ratio/high_mean": 6.97559880791232e-06, + "clip_ratio/low_mean": 2.81686479866039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.514424770401092e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15153.0, + "completions/mean_length": 7383.03125, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8432145267724991, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002299589104950428, + "learning_rate": 1e-05, + "loss": 0.0212, + "num_tokens": 222648865.0, + "reward": 0.3125, + "reward_std": 0.2845909595489502, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999674558639526, + "sampling/importance_sampling_ratio/min": 2.8099755581934005e-05, + 
"sampling/sampling_logp_difference/max": 10.47974967956543, + "sampling/sampling_logp_difference/mean": 0.018576428294181824, + "step": 274 + }, + { + "clip_ratio/high_max": 9.285309715778567e-06, + "clip_ratio/high_mean": 3.327153194732091e-06, + "clip_ratio/low_mean": 3.823394035862293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.156109298492083e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 6628.921875, + "completions/mean_terminated_length": 6552.1103515625, + "completions/min_length": 903.0, + "completions/min_terminated_length": 903.0, + "entropy": 0.9039670825004578, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024530349764972925, + "learning_rate": 1e-05, + "loss": 0.1161, + "num_tokens": 223519175.0, + "reward": 0.59375, + "reward_std": 0.3537701964378357, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999820590019226, + "sampling/importance_sampling_ratio/min": 0.0003009368374478072, + "sampling/sampling_logp_difference/max": 8.108610153198242, + "sampling/sampling_logp_difference/mean": 0.01871109940111637, + "step": 275 + }, + { + "clip_ratio/high_max": 1.5403714087369735e-05, + "clip_ratio/high_mean": 3.850928521842434e-06, + "clip_ratio/low_mean": 3.431152225630285e-05, + "clip_ratio/low_min": 4.570718374452554e-06, + "clip_ratio/region_mean": 3.816245106236238e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16081.0, + "completions/mean_length": 7335.3359375, + "completions/mean_terminated_length": 7118.16845703125, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.8435061648488045, + "epoch": 0.25390984360625574, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0019706569146364927, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 224479306.0, + "reward": 0.34375, + "reward_std": 0.28223684430122375, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 3.288762854936067e-06, + "sampling/sampling_logp_difference/max": 12.624999046325684, + "sampling/sampling_logp_difference/mean": 0.018783386796712875, + "step": 276 + }, + { + "clip_ratio/high_max": 1.979319677047897e-05, + "clip_ratio/high_mean": 4.948299192619743e-06, + "clip_ratio/low_mean": 2.4465696469633258e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9413995889626676e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16333.0, + "completions/mean_length": 6052.1953125, + "completions/mean_terminated_length": 5718.9111328125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.8186529725790024, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001542358542792499, + "learning_rate": 1e-05, + "loss": 0.0906, + "num_tokens": 225273523.0, + "reward": 0.46875, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004768371582, + "sampling/importance_sampling_ratio/min": 0.0017039870144799352, + "sampling/sampling_logp_difference/max": 6.374784469604492, + "sampling/sampling_logp_difference/mean": 0.0183861143887043, + "step": 277 + }, + { + "clip_ratio/high_max": 2.5990090307459468e-05, + "clip_ratio/high_mean": 6.497522576864867e-06, + "clip_ratio/low_mean": 5.721013076254167e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.370765299834602e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13648.0, + "completions/mean_length": 6560.75, + "completions/mean_terminated_length": 6404.82568359375, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 1.0198248624801636, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002488402184098959, + "learning_rate": 1e-05, + "loss": 0.0646, + "num_tokens": 226134235.0, + "reward": 0.375, + "reward_std": 0.3805803954601288, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 5.428586973721394e-06, + "sampling/sampling_logp_difference/max": 12.123831748962402, + "sampling/sampling_logp_difference/mean": 0.020803291350603104, + "step": 278 + }, + { + "clip_ratio/high_max": 1.1638113846856868e-05, + "clip_ratio/high_mean": 2.909528461714217e-06, + "clip_ratio/low_mean": 3.2134936191141605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.504446431179531e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12694.0, + "completions/max_terminated_length": 12694.0, + "completions/mean_length": 5217.140625, + "completions/mean_terminated_length": 5217.140625, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.8947679325938225, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035258245188742876, + "learning_rate": 1e-05, + "loss": 0.1095, + "num_tokens": 226821989.0, + "reward": 0.6015625, + "reward_std": 0.4092749357223511, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998952150344849, + "sampling/importance_sampling_ratio/min": 1.0208474122919142e-05, + "sampling/sampling_logp_difference/max": 11.492292404174805, + "sampling/sampling_logp_difference/mean": 0.018339669331908226, + "step": 279 + }, + { + "clip_ratio/high_max": 1.1735807220247807e-05, + "clip_ratio/high_mean": 2.9339518050619517e-06, + "clip_ratio/low_mean": 1.676440933806589e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9698360574693652e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 7622.609375, + "completions/mean_terminated_length": 7483.5400390625, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "entropy": 0.760207436978817, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001208966481499374, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 227815683.0, + "reward": 0.4609375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998728632926941, + "sampling/importance_sampling_ratio/min": 4.0069728129310533e-05, + "sampling/sampling_logp_difference/max": 10.124889373779297, + "sampling/sampling_logp_difference/mean": 0.018406979739665985, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.5826797437057394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5826797437057394e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 5981.90625, + "completions/mean_terminated_length": 5816.7939453125, + 
"completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.902967743575573, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001788424444384873, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 228599647.0, + "reward": 0.4609375, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.0013331151567399502, + "sampling/sampling_logp_difference/max": 6.620236873626709, + "sampling/sampling_logp_difference/mean": 0.018927905708551407, + "step": 281 + }, + { + "clip_ratio/high_max": 1.6327461935361498e-05, + "clip_ratio/high_mean": 4.0818654838403745e-06, + "clip_ratio/low_mean": 3.461411097305245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.86959764000494e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15850.0, + "completions/mean_length": 6156.0, + "completions/mean_terminated_length": 5993.6513671875, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "entropy": 0.8951378241181374, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0039085340686142445, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 229405495.0, + "reward": 0.5234375, + "reward_std": 0.304566353559494, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 0.007635246496647596, + "sampling/sampling_logp_difference/max": 4.8749799728393555, + "sampling/sampling_logp_difference/mean": 0.018469247967004776, + "step": 282 + }, + { + 
"clip_ratio/high_max": 1.3168388704798417e-05, + "clip_ratio/high_mean": 3.2920971761996043e-06, + "clip_ratio/low_mean": 3.1043596322888334e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4335693726461614e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15808.0, + "completions/mean_length": 7229.234375, + "completions/mean_terminated_length": 6933.9189453125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 1.0803911909461021, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001881407224573195, + "learning_rate": 1e-05, + "loss": 0.0616, + "num_tokens": 230350725.0, + "reward": 0.2890625, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000948905944824, + "sampling/importance_sampling_ratio/min": 3.536981239449233e-05, + "sampling/sampling_logp_difference/max": 10.249651908874512, + "sampling/sampling_logp_difference/mean": 0.021804997697472572, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.664479729399318e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.664479729399318e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16135.0, + "completions/mean_length": 7486.2734375, + "completions/mean_terminated_length": 6971.52880859375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.9674680531024933, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0015280995285138488, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 231330664.0, + "reward": 0.234375, + "reward_std": 0.22620804607868195, + 
"rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999110102653503, + "sampling/importance_sampling_ratio/min": 0.010103696957230568, + "sampling/sampling_logp_difference/max": 4.59485387802124, + "sampling/sampling_logp_difference/mean": 0.02071535401046276, + "step": 284 + }, + { + "clip_ratio/high_max": 7.207103408291005e-06, + "clip_ratio/high_mean": 3.596102942537982e-06, + "clip_ratio/low_mean": 4.2366073103039525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.596217695507221e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 6439.40625, + "completions/mean_terminated_length": 6361.1025390625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "entropy": 0.8368510156869888, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024581989273428917, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 232174804.0, + "reward": 0.40625, + "reward_std": 0.3527044653892517, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999905228614807, + "sampling/importance_sampling_ratio/min": 0.0010985663393512368, + "sampling/sampling_logp_difference/max": 6.813749313354492, + "sampling/sampling_logp_difference/mean": 0.018181029707193375, + "step": 285 + }, + { + "clip_ratio/high_max": 2.0772107973243692e-05, + "clip_ratio/high_mean": 6.365107253714086e-06, + "clip_ratio/low_mean": 6.206619241311273e-05, + "clip_ratio/low_min": 1.0199641110375524e-05, + "clip_ratio/region_mean": 6.843129881417553e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15297.0, + "completions/mean_length": 6642.3984375, + "completions/mean_terminated_length": 6163.302734375, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 1.080193243920803, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026200765278190374, + "learning_rate": 1e-05, + "loss": 0.1, + "num_tokens": 233042999.0, + "reward": 0.3828125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808669090271, + "sampling/importance_sampling_ratio/min": 0.00035727949580177665, + "sampling/sampling_logp_difference/max": 7.936992168426514, + "sampling/sampling_logp_difference/mean": 0.020303232595324516, + "step": 286 + }, + { + "clip_ratio/high_max": 2.1764372377219843e-05, + "clip_ratio/high_mean": 5.441093094304961e-06, + "clip_ratio/low_mean": 8.049383222896722e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.593492520958534e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 5594.3984375, + "completions/mean_terminated_length": 5509.44091796875, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.8376244381070137, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028024003840982914, + "learning_rate": 1e-05, + "loss": 0.0317, + "num_tokens": 233778538.0, + "reward": 0.390625, + "reward_std": 0.3566610813140869, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999902844429016, + "sampling/importance_sampling_ratio/min": 
0.030517347157001495, + "sampling/sampling_logp_difference/max": 3.489459991455078, + "sampling/sampling_logp_difference/mean": 0.01896265149116516, + "step": 287 + }, + { + "clip_ratio/high_max": 1.9571571556298295e-05, + "clip_ratio/high_mean": 4.892892889074574e-06, + "clip_ratio/low_mean": 1.3305952052178327e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8198844827566063e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16069.0, + "completions/mean_length": 6939.7890625, + "completions/mean_terminated_length": 6635.13671875, + "completions/min_length": 1303.0, + "completions/min_terminated_length": 1303.0, + "entropy": 0.923162192106247, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0004863851936534047, + "learning_rate": 1e-05, + "loss": 0.0663, + "num_tokens": 234683871.0, + "reward": 0.5234375, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 4.343670661910437e-05, + "sampling/sampling_logp_difference/max": 10.044205665588379, + "sampling/sampling_logp_difference/mean": 0.018946819007396698, + "step": 288 + }, + { + "clip_ratio/high_max": 2.6291640551789897e-05, + "clip_ratio/high_mean": 6.572910137947474e-06, + "clip_ratio/low_mean": 4.438247970028897e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0955390179296955e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15671.0, + "completions/mean_length": 5808.1796875, + "completions/mean_terminated_length": 5640.31005859375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.8330265805125237, + "epoch": 0.265869365225391, 
+ "frac_reward_zero_std": 0.5, + "grad_norm": 0.003028205828741193, + "learning_rate": 1e-05, + "loss": 0.0318, + "num_tokens": 235446758.0, + "reward": 0.5078125, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 0.05524001643061638, + "sampling/sampling_logp_difference/max": 3.001615524291992, + "sampling/sampling_logp_difference/mean": 0.018604904413223267, + "step": 289 + }, + { + "clip_ratio/high_max": 4.42854116045055e-06, + "clip_ratio/high_mean": 1.1071352901126374e-06, + "clip_ratio/low_mean": 3.1940794087859103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.30479292642849e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + "completions/mean_length": 7106.125, + "completions/mean_terminated_length": 6806.83837890625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "entropy": 1.0014382004737854, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022615960333496332, + "learning_rate": 1e-05, + "loss": 0.0369, + "num_tokens": 236377494.0, + "reward": 0.34375, + "reward_std": 0.33614397048950195, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999083280563354, + "sampling/importance_sampling_ratio/min": 0.0008234601118601859, + "sampling/sampling_logp_difference/max": 7.101995468139648, + "sampling/sampling_logp_difference/mean": 0.02129078283905983, + "step": 290 + }, + { + "clip_ratio/high_max": 9.011766906041885e-06, + "clip_ratio/high_mean": 2.252941726510471e-06, + "clip_ratio/low_mean": 2.9379379270721984e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.163232122460613e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 6830.2109375, + "completions/mean_terminated_length": 6360.35205078125, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.8726402744650841, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002122451551258564, + "learning_rate": 1e-05, + "loss": 0.0083, + "num_tokens": 237269977.0, + "reward": 0.484375, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999386072158813, + "sampling/importance_sampling_ratio/min": 0.0003835389798041433, + "sampling/sampling_logp_difference/max": 7.866069316864014, + "sampling/sampling_logp_difference/mean": 0.018967002630233765, + "step": 291 + }, + { + "clip_ratio/high_max": 3.987113814218901e-06, + "clip_ratio/high_mean": 9.967784535547253e-07, + "clip_ratio/low_mean": 2.8655875098593242e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9652653552147967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16246.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6704.171875, + "completions/mean_terminated_length": 6704.171875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.9421284720301628, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001218589604832232, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 238147359.0, + "reward": 0.3515625, + "reward_std": 0.2012200504541397, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.002478870330378413, + "sampling/sampling_logp_difference/max": 5.99995231628418, + "sampling/sampling_logp_difference/mean": 0.02092663012444973, + "step": 292 + }, + { + "clip_ratio/high_max": 8.067639100772794e-06, + "clip_ratio/high_mean": 2.0169097751931986e-06, + "clip_ratio/low_mean": 4.687528951308195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.889219928827515e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6142.8203125, + "completions/mean_terminated_length": 5639.1552734375, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.1285494044423103, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003979295492172241, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 238953104.0, + "reward": 0.265625, + "reward_std": 0.2756393849849701, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.00349772023037076, + "sampling/sampling_logp_difference/max": 5.655643939971924, + "sampling/sampling_logp_difference/mean": 0.022049173712730408, + "step": 293 + }, + { + "clip_ratio/high_max": 1.4033725619810866e-05, + "clip_ratio/high_mean": 3.5084314049527165e-06, + "clip_ratio/low_mean": 2.4028336156334262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7536767788660654e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15710.0, + "completions/mean_length": 5622.296875, + "completions/mean_terminated_length": 
5275.14501953125, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "entropy": 0.9032362103462219, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022260278929024935, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 239699350.0, + "reward": 0.53125, + "reward_std": 0.2748701572418213, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663829803467, + "sampling/importance_sampling_ratio/min": 9.907654748531058e-05, + "sampling/sampling_logp_difference/max": 9.21961784362793, + "sampling/sampling_logp_difference/mean": 0.018553178757429123, + "step": 294 + }, + { + "clip_ratio/high_max": 2.0970909417883377e-05, + "clip_ratio/high_mean": 7.081109117734741e-06, + "clip_ratio/low_mean": 2.478300689290336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.186411640854203e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15532.0, + "completions/mean_length": 7203.6640625, + "completions/mean_terminated_length": 6752.171875, + "completions/min_length": 1073.0, + "completions/min_terminated_length": 1073.0, + "entropy": 0.9958974272012711, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001666489290073514, + "learning_rate": 1e-05, + "loss": 0.0249, + "num_tokens": 240640387.0, + "reward": 0.484375, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999366998672485, + "sampling/importance_sampling_ratio/min": 0.003141714259982109, + "sampling/sampling_logp_difference/max": 5.762986660003662, + "sampling/sampling_logp_difference/mean": 0.02084190584719181, + 
"step": 295 + }, + { + "clip_ratio/high_max": 2.8518336421257118e-05, + "clip_ratio/high_mean": 1.1702542110469949e-05, + "clip_ratio/low_mean": 4.6755864048009244e-05, + "clip_ratio/low_min": 9.262003914045636e-06, + "clip_ratio/region_mean": 5.8458407011130475e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7692.4765625, + "completions/mean_terminated_length": 7412.2578125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9312580227851868, + "epoch": 0.27230910763569455, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019504680531099439, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 241647840.0, + "reward": 0.3828125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998970031738281, + "sampling/importance_sampling_ratio/min": 0.00011594472016440704, + "sampling/sampling_logp_difference/max": 9.062397003173828, + "sampling/sampling_logp_difference/mean": 0.02081790193915367, + "step": 296 + }, + { + "clip_ratio/high_max": 2.4005360501178075e-05, + "clip_ratio/high_mean": 6.001340125294519e-06, + "clip_ratio/low_mean": 3.910731970790948e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.510866097007238e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14196.0, + "completions/mean_length": 6142.09375, + "completions/mean_terminated_length": 6061.44873046875, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "entropy": 0.8636585548520088, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025938916951417923, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 
242452692.0, + "reward": 0.515625, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999980926513672, + "sampling/importance_sampling_ratio/min": 2.320722842341638e-06, + "sampling/sampling_logp_difference/max": 12.973631858825684, + "sampling/sampling_logp_difference/mean": 0.019208990037441254, + "step": 297 + }, + { + "clip_ratio/high_max": 4.168055966147222e-06, + "clip_ratio/high_mean": 1.0420139915368054e-06, + "clip_ratio/low_mean": 3.8637008401565254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.967902239310206e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16030.0, + "completions/max_terminated_length": 16030.0, + "completions/mean_length": 6112.6171875, + "completions/mean_terminated_length": 6112.6171875, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "entropy": 0.8610381335020065, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014701929176226258, + "learning_rate": 1e-05, + "loss": 0.0377, + "num_tokens": 243255243.0, + "reward": 0.3984375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271035194397, + "sampling/importance_sampling_ratio/min": 4.6073862904449925e-05, + "sampling/sampling_logp_difference/max": 9.985264778137207, + "sampling/sampling_logp_difference/mean": 0.018754754215478897, + "step": 298 + }, + { + "clip_ratio/high_max": 8.054383215494454e-06, + "clip_ratio/high_mean": 2.0135958038736135e-06, + "clip_ratio/low_mean": 4.2183424454833585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4197020486080874e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16365.0, + "completions/mean_length": 7204.4375, + "completions/mean_terminated_length": 7132.1572265625, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 1.0613816231489182, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023235646076500416, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 244198291.0, + "reward": 0.3203125, + "reward_std": 0.3119252324104309, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999468326568604, + "sampling/importance_sampling_ratio/min": 3.256353693359415e-07, + "sampling/sampling_logp_difference/max": 14.937487602233887, + "sampling/sampling_logp_difference/mean": 0.02158042974770069, + "step": 299 + }, + { + "clip_ratio/high_max": 1.0963113709294703e-05, + "clip_ratio/high_mean": 3.833359528471192e-06, + "clip_ratio/low_mean": 4.1291930529041565e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5125290171199595e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16356.0, + "completions/mean_length": 6308.59375, + "completions/mean_terminated_length": 6066.7841796875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.8048126623034477, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002957145916298032, + "learning_rate": 1e-05, + "loss": 0.0926, + "num_tokens": 245022975.0, + "reward": 0.484375, + "reward_std": 0.3692649006843567, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999489188194275, + 
"sampling/importance_sampling_ratio/min": 0.0005304187070578337, + "sampling/sampling_logp_difference/max": 7.541843891143799, + "sampling/sampling_logp_difference/mean": 0.017426976934075356, + "step": 300 + }, + { + "clip_ratio/high_max": 1.863301304183551e-05, + "clip_ratio/high_mean": 4.658253260458878e-06, + "clip_ratio/low_mean": 7.454315527866129e-05, + "clip_ratio/low_min": 8.290224286611192e-06, + "clip_ratio/region_mean": 7.920140842543333e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6183.75, + "completions/mean_terminated_length": 5938.9443359375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.8879657089710236, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002814161591231823, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 245831183.0, + "reward": 0.46875, + "reward_std": 0.3156445026397705, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352097511292, + "sampling/importance_sampling_ratio/min": 7.562734390376136e-05, + "sampling/sampling_logp_difference/max": 9.489692687988281, + "sampling/sampling_logp_difference/mean": 0.01883331872522831, + "step": 301 + }, + { + "clip_ratio/high_max": 9.606681487639435e-06, + "clip_ratio/high_mean": 2.4016703719098587e-06, + "clip_ratio/low_mean": 3.564927715160593e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.805094752351579e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 5656.8984375, + "completions/mean_terminated_length": 5310.86279296875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 
0.8461362943053246, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00238890596665442, + "learning_rate": 1e-05, + "loss": 0.1344, + "num_tokens": 246576170.0, + "reward": 0.3984375, + "reward_std": 0.37609970569610596, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999550580978394, + "sampling/importance_sampling_ratio/min": 0.000344505300745368, + "sampling/sampling_logp_difference/max": 7.973401069641113, + "sampling/sampling_logp_difference/mean": 0.01883539929986, + "step": 302 + }, + { + "clip_ratio/high_max": 3.868412022711709e-06, + "clip_ratio/high_mean": 9.671030056779273e-07, + "clip_ratio/low_mean": 4.4275341792854306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.524244479853223e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14949.0, + "completions/mean_length": 7402.484375, + "completions/mean_terminated_length": 7331.763671875, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.9303053691983223, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002722573932260275, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 247542448.0, + "reward": 0.359375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998664259910583, + "sampling/importance_sampling_ratio/min": 0.0015035009710118175, + "sampling/sampling_logp_difference/max": 6.4999589920043945, + "sampling/sampling_logp_difference/mean": 0.020525872707366943, + "step": 303 + }, + { + "clip_ratio/high_max": 3.7332376905396814e-06, + "clip_ratio/high_mean": 
9.333094226349203e-07, + "clip_ratio/low_mean": 2.2581028019885707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3514337442520628e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15655.0, + "completions/mean_length": 6920.7734375, + "completions/mean_terminated_length": 6455.36865234375, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.9233825877308846, + "epoch": 0.2796688132474701, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024008466862142086, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 248446787.0, + "reward": 0.328125, + "reward_std": 0.2359210103750229, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999996423721313, + "sampling/importance_sampling_ratio/min": 0.00010231315536657348, + "sampling/sampling_logp_difference/max": 9.187472343444824, + "sampling/sampling_logp_difference/mean": 0.01887384243309498, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1328072105243336e-05, + "clip_ratio/high_mean": 2.832018026310834e-06, + "clip_ratio/low_mean": 3.6861969306301035e-05, + "clip_ratio/low_min": 4.25054395236657e-06, + "clip_ratio/region_mean": 3.969398790104606e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 6658.7109375, + "completions/mean_terminated_length": 6504.341796875, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.9102077335119247, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016227345913648605, + "learning_rate": 1e-05, + "loss": 0.0684, + "num_tokens": 249318094.0, + "reward": 0.5078125, + "reward_std": 0.2624938488006592, + 
"rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998591542243958, + "sampling/importance_sampling_ratio/min": 0.0038418183103203773, + "sampling/sampling_logp_difference/max": 5.561809539794922, + "sampling/sampling_logp_difference/mean": 0.019931891933083534, + "step": 305 + }, + { + "clip_ratio/high_max": 5.2942118600185495e-06, + "clip_ratio/high_mean": 1.3235529650046374e-06, + "clip_ratio/low_mean": 4.644989053304016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7773443156984285e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8597.84375, + "completions/mean_terminated_length": 8346.6767578125, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9965319409966469, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023056245408952236, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 250435674.0, + "reward": 0.296875, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.005126871634274721, + "sampling/sampling_logp_difference/max": 5.27325963973999, + "sampling/sampling_logp_difference/mean": 0.02132929116487503, + "step": 306 + }, + { + "clip_ratio/high_max": 8.388911510337493e-06, + "clip_ratio/high_mean": 2.0972278775843733e-06, + "clip_ratio/low_mean": 4.1705150920279266e-05, + "clip_ratio/low_min": 5.85781890549697e-06, + "clip_ratio/region_mean": 4.380237885470706e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14905.0, + 
"completions/max_terminated_length": 14905.0, + "completions/mean_length": 6053.0390625, + "completions/mean_terminated_length": 6053.0390625, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "entropy": 1.0717384740710258, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022299408446997404, + "learning_rate": 1e-05, + "loss": 0.0054, + "num_tokens": 251232847.0, + "reward": 0.3515625, + "reward_std": 0.26143795251846313, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006914138794, + "sampling/importance_sampling_ratio/min": 0.0024789744056761265, + "sampling/sampling_logp_difference/max": 5.999910354614258, + "sampling/sampling_logp_difference/mean": 0.021233227103948593, + "step": 307 + }, + { + "clip_ratio/high_max": 1.0162047374251415e-05, + "clip_ratio/high_mean": 2.5405118435628538e-06, + "clip_ratio/low_mean": 5.296576864566305e-05, + "clip_ratio/low_min": 8.864200026437175e-06, + "clip_ratio/region_mean": 5.550628043238248e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15929.0, + "completions/mean_length": 6553.7109375, + "completions/mean_terminated_length": 6476.30712890625, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.9829569607973099, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026091893669217825, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 252088154.0, + "reward": 0.4140625, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999917149543762, + 
"sampling/importance_sampling_ratio/min": 0.0010629174066707492, + "sampling/sampling_logp_difference/max": 6.846737861633301, + "sampling/sampling_logp_difference/mean": 0.020414084196090698, + "step": 308 + }, + { + "clip_ratio/high_max": 9.021045798363048e-06, + "clip_ratio/high_mean": 2.255261449590762e-06, + "clip_ratio/low_mean": 3.9386548451147974e-05, + "clip_ratio/low_min": 4.476596132008126e-06, + "clip_ratio/region_mean": 4.1641809502834803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15218.0, + "completions/mean_length": 6391.7421875, + "completions/mean_terminated_length": 5985.552734375, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.7887687161564827, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018632705323398113, + "learning_rate": 1e-05, + "loss": 0.1007, + "num_tokens": 252926073.0, + "reward": 0.4609375, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999994158744812, + "sampling/importance_sampling_ratio/min": 0.0001141107059083879, + "sampling/sampling_logp_difference/max": 9.078341484069824, + "sampling/sampling_logp_difference/mean": 0.016558727249503136, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.388932546182332e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.388932546182332e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15492.0, + "completions/mean_length": 7519.140625, + "completions/mean_terminated_length": 7306.38427734375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.8663278818130493, + 
"epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014314674772322178, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 253908571.0, + "reward": 0.296875, + "reward_std": 0.21436560153961182, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999785423278809, + "sampling/importance_sampling_ratio/min": 9.006411971768102e-08, + "sampling/sampling_logp_difference/max": 16.22274398803711, + "sampling/sampling_logp_difference/mean": 0.019052794203162193, + "step": 310 + }, + { + "clip_ratio/high_max": 4.941101906297263e-06, + "clip_ratio/high_mean": 1.2352754765743157e-06, + "clip_ratio/low_mean": 1.9741319533750357e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0976595237698348e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15343.0, + "completions/max_terminated_length": 15343.0, + "completions/mean_length": 5273.7265625, + "completions/mean_terminated_length": 5273.7265625, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.973240926861763, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00404210714623332, + "learning_rate": 1e-05, + "loss": 0.0706, + "num_tokens": 254601856.0, + "reward": 0.4921875, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999933123588562, + "sampling/importance_sampling_ratio/min": 5.1447856094455346e-05, + "sampling/sampling_logp_difference/max": 9.8749418258667, + "sampling/sampling_logp_difference/mean": 0.01859421283006668, + "step": 311 + }, + { + "clip_ratio/high_max": 9.725902600621339e-06, + "clip_ratio/high_mean": 2.4314756501553347e-06, + 
"clip_ratio/low_mean": 2.9865542501283926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2297018492499774e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 6674.5390625, + "completions/mean_terminated_length": 6598.08642578125, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "entropy": 0.9493648260831833, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003681440372020006, + "learning_rate": 1e-05, + "loss": 0.0347, + "num_tokens": 255474357.0, + "reward": 0.359375, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998538494110107, + "sampling/importance_sampling_ratio/min": 4.5425484131556004e-05, + "sampling/sampling_logp_difference/max": 9.99943733215332, + "sampling/sampling_logp_difference/mean": 0.020322658121585846, + "step": 312 + }, + { + "clip_ratio/high_max": 1.3442999488688656e-05, + "clip_ratio/high_mean": 4.46992856950601e-06, + "clip_ratio/low_mean": 4.9175514504895546e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3645443131244974e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7113.59375, + "completions/mean_terminated_length": 6736.74755859375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.8717286512255669, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014825655380263925, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 256405745.0, + "reward": 0.3984375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.3984375, + 
"rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999269247055054, + "sampling/importance_sampling_ratio/min": 0.0015039225108921528, + "sampling/sampling_logp_difference/max": 6.499678611755371, + "sampling/sampling_logp_difference/mean": 0.019822338595986366, + "step": 313 + }, + { + "clip_ratio/high_max": 2.0328425534898997e-05, + "clip_ratio/high_mean": 6.525457763473241e-06, + "clip_ratio/low_mean": 1.983899721835769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.636445498183093e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15655.0, + "completions/mean_length": 5819.9765625, + "completions/mean_terminated_length": 5736.79541015625, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.9206694886088371, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002185023855417967, + "learning_rate": 1e-05, + "loss": 0.0957, + "num_tokens": 257171214.0, + "reward": 0.4375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 0.0011616232804954052, + "sampling/sampling_logp_difference/max": 6.757936954498291, + "sampling/sampling_logp_difference/mean": 0.018492478877305984, + "step": 314 + }, + { + "clip_ratio/high_max": 2.2664371726932586e-05, + "clip_ratio/high_mean": 6.88441667762163e-06, + "clip_ratio/low_mean": 4.306056735003949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.994498453925189e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16170.0, + "completions/mean_length": 
6754.7109375, + "completions/mean_terminated_length": 6523.6083984375, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "entropy": 0.8881036639213562, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022363397292792797, + "learning_rate": 1e-05, + "loss": 0.1086, + "num_tokens": 258064049.0, + "reward": 0.5234375, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0005261205951683223, + "sampling/sampling_logp_difference/max": 7.549980163574219, + "sampling/sampling_logp_difference/mean": 0.01989433914422989, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.3297232107543095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3297232107543095e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15599.0, + "completions/mean_length": 7953.421875, + "completions/mean_terminated_length": 7610.71533203125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9007300287485123, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001413302612490952, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 259098655.0, + "reward": 0.3203125, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 0.00017562482389621437, + "sampling/sampling_logp_difference/max": 8.647160530090332, + "sampling/sampling_logp_difference/mean": 
0.019421691074967384, + "step": 316 + }, + { + "clip_ratio/high_max": 3.664743485387589e-05, + "clip_ratio/high_mean": 1.2026366050577053e-05, + "clip_ratio/low_mean": 3.211230455235636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4138670659776835e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15430.0, + "completions/mean_length": 6669.390625, + "completions/mean_terminated_length": 6515.19091796875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.8598581254482269, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018268795683979988, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 259971017.0, + "reward": 0.4453125, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 1.7517091066565627e-07, + "sampling/sampling_logp_difference/max": 15.557503700256348, + "sampling/sampling_logp_difference/mean": 0.01863129623234272, + "step": 317 + }, + { + "clip_ratio/high_max": 5.219860668148613e-06, + "clip_ratio/high_mean": 1.3049651670371532e-06, + "clip_ratio/low_mean": 2.3785564053468988e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509052933419298e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11342.0, + "completions/max_terminated_length": 11342.0, + "completions/mean_length": 5268.2890625, + "completions/mean_terminated_length": 5268.2890625, + "completions/min_length": 818.0, + "completions/min_terminated_length": 818.0, + "entropy": 0.8647450804710388, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027839087415486574, + "learning_rate": 1e-05, + "loss": 0.1259, + "num_tokens": 
260663534.0, + "reward": 0.6171875, + "reward_std": 0.3345640003681183, + "rewards/accuracy_reward/mean": 0.6171875, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998882412910461, + "sampling/importance_sampling_ratio/min": 0.008392918854951859, + "sampling/sampling_logp_difference/max": 4.780366897583008, + "sampling/sampling_logp_difference/mean": 0.017936093732714653, + "step": 318 + }, + { + "clip_ratio/high_max": 3.5293785458634375e-06, + "clip_ratio/high_mean": 8.823446364658594e-07, + "clip_ratio/low_mean": 3.2431569934487925e-05, + "clip_ratio/low_min": 3.789371476159431e-06, + "clip_ratio/region_mean": 3.331391440042353e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14955.0, + "completions/mean_length": 7037.0, + "completions/mean_terminated_length": 6496.26416015625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9258207008242607, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002726807491853833, + "learning_rate": 1e-05, + "loss": 0.1071, + "num_tokens": 261583222.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999408721923828, + "sampling/importance_sampling_ratio/min": 0.0004893821314908564, + "sampling/sampling_logp_difference/max": 7.622366905212402, + "sampling/sampling_logp_difference/mean": 0.019336845725774765, + "step": 319 + }, + { + "clip_ratio/high_max": 3.219348491256824e-05, + "clip_ratio/high_mean": 8.04837122814206e-06, + "clip_ratio/low_mean": 3.258790718518867e-05, + "clip_ratio/low_min": 6.961073722777655e-06, + "clip_ratio/region_mean": 4.0636279095451755e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 6469.78125, + "completions/mean_terminated_length": 6391.71630859375, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "entropy": 0.9932648614048958, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00209408369846642, + "learning_rate": 1e-05, + "loss": 0.0446, + "num_tokens": 262430162.0, + "reward": 0.375, + "reward_std": 0.3640199303627014, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999074339866638, + "sampling/importance_sampling_ratio/min": 0.003386466298252344, + "sampling/sampling_logp_difference/max": 5.6879682540893555, + "sampling/sampling_logp_difference/mean": 0.020799942314624786, + "step": 320 + }, + { + "clip_ratio/high_max": 2.827135813276982e-05, + "clip_ratio/high_mean": 8.08931497431331e-06, + "clip_ratio/low_mean": 4.0315980186278466e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.840529436478391e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15815.0, + "completions/max_terminated_length": 15815.0, + "completions/mean_length": 5471.6953125, + "completions/mean_terminated_length": 5471.6953125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.979861818253994, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032934497576206923, + "learning_rate": 1e-05, + "loss": 0.0511, + "num_tokens": 263148331.0, + "reward": 0.4453125, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000145435333252, + 
"sampling/importance_sampling_ratio/min": 4.68981761514442e-06, + "sampling/sampling_logp_difference/max": 12.270116806030273, + "sampling/sampling_logp_difference/mean": 0.019479844719171524, + "step": 321 + }, + { + "clip_ratio/high_max": 1.3237559869594406e-05, + "clip_ratio/high_mean": 3.3093899673986016e-06, + "clip_ratio/low_mean": 5.419432636699639e-05, + "clip_ratio/low_min": 3.509559974190779e-06, + "clip_ratio/region_mean": 5.750371656176867e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16121.0, + "completions/mean_length": 6640.65625, + "completions/mean_terminated_length": 6161.47509765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8560378029942513, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014544804580509663, + "learning_rate": 1e-05, + "loss": 0.1159, + "num_tokens": 264017391.0, + "reward": 0.515625, + "reward_std": 0.31983357667922974, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999976396560669, + "sampling/importance_sampling_ratio/min": 0.00810791365802288, + "sampling/sampling_logp_difference/max": 4.814914703369141, + "sampling/sampling_logp_difference/mean": 0.01882140152156353, + "step": 322 + }, + { + "clip_ratio/high_max": 3.979497705586255e-06, + "clip_ratio/high_mean": 9.948744263965636e-07, + "clip_ratio/low_mean": 3.569043906281877e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.668531348921533e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16249.0, + "completions/mean_length": 5950.7421875, + "completions/mean_terminated_length": 5700.34423828125, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + 
"entropy": 0.9033292010426521, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001294711953960359, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 264799326.0, + "reward": 0.5546875, + "reward_std": 0.22621294856071472, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000641345977783, + "sampling/importance_sampling_ratio/min": 0.0011992956278845668, + "sampling/sampling_logp_difference/max": 6.726020812988281, + "sampling/sampling_logp_difference/mean": 0.019538050517439842, + "step": 323 + }, + { + "clip_ratio/high_max": 3.0064740258239908e-05, + "clip_ratio/high_mean": 7.516185064559977e-06, + "clip_ratio/low_mean": 3.826810700502392e-05, + "clip_ratio/low_min": 4.875575541518629e-06, + "clip_ratio/region_mean": 4.578429286539176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15068.0, + "completions/mean_length": 6356.0703125, + "completions/mean_terminated_length": 6196.89697265625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.8268664851784706, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022473863791674376, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 265630895.0, + "reward": 0.4375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 2.5895053113345057e-05, + "sampling/sampling_logp_difference/max": 10.561458587646484, + "sampling/sampling_logp_difference/mean": 0.01843554526567459, + "step": 324 + }, + { + "clip_ratio/high_max": 1.8887641999754123e-05, + 
"clip_ratio/high_mean": 5.5906657507875934e-06, + "clip_ratio/low_mean": 7.594743829031358e-05, + "clip_ratio/low_min": 8.592850917921169e-06, + "clip_ratio/region_mean": 8.153810449584853e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7955.546875, + "completions/mean_terminated_length": 7821.76220703125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.9475079327821732, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023036333732306957, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 266666285.0, + "reward": 0.421875, + "reward_std": 0.36008089780807495, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 1.0642166614616144e-07, + "sampling/sampling_logp_difference/max": 16.055856704711914, + "sampling/sampling_logp_difference/mean": 0.020778125151991844, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9688118729609414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9688118729609414e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16247.0, + "completions/mean_length": 7701.7578125, + "completions/mean_terminated_length": 6965.974609375, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "entropy": 0.8349794074892998, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020953568164259195, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 267669230.0, + "reward": 0.46875, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 
0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999356269836426, + "sampling/importance_sampling_ratio/min": 0.010210023261606693, + "sampling/sampling_logp_difference/max": 4.584385395050049, + "sampling/sampling_logp_difference/mean": 0.018453046679496765, + "step": 326 + }, + { + "clip_ratio/high_max": 1.9330177565279882e-05, + "clip_ratio/high_mean": 4.832544391319971e-06, + "clip_ratio/low_mean": 3.980111284818122e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4633657012127514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16374.0, + "completions/mean_length": 7335.40625, + "completions/mean_terminated_length": 7118.240234375, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "entropy": 0.9238340929150581, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016563549870625138, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 268627714.0, + "reward": 0.390625, + "reward_std": 0.32036250829696655, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.0011709382524713874, + "sampling/sampling_logp_difference/max": 6.749949932098389, + "sampling/sampling_logp_difference/mean": 0.019696014001965523, + "step": 327 + }, + { + "clip_ratio/high_max": 1.5036271179269534e-05, + "clip_ratio/high_mean": 3.7590677948173834e-06, + "clip_ratio/low_mean": 4.6864498017384904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.062356603957596e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15040.0, + "completions/max_terminated_length": 15040.0, + 
"completions/mean_length": 6259.875, + "completions/mean_terminated_length": 6259.875, + "completions/min_length": 1012.0, + "completions/min_terminated_length": 1012.0, + "entropy": 1.0842352360486984, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017849374562501907, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 269447338.0, + "reward": 0.3984375, + "reward_std": 0.2977364957332611, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998852014541626, + "sampling/importance_sampling_ratio/min": 0.009620909579098225, + "sampling/sampling_logp_difference/max": 4.6438164710998535, + "sampling/sampling_logp_difference/mean": 0.020421095192432404, + "step": 328 + }, + { + "clip_ratio/high_max": 1.4728739188285545e-05, + "clip_ratio/high_mean": 3.682184797071386e-06, + "clip_ratio/low_mean": 2.7205874630453764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.08880598822725e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15784.0, + "completions/max_terminated_length": 15784.0, + "completions/mean_length": 7626.125, + "completions/mean_terminated_length": 7626.125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.1077729761600494, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017999790143221617, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 270444594.0, + "reward": 0.390625, + "reward_std": 0.24381662905216217, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991375207901, + "sampling/importance_sampling_ratio/min": 2.4265028741865535e-07, + "sampling/sampling_logp_difference/max": 15.231644630432129, + 
"sampling/sampling_logp_difference/mean": 0.021409697830677032, + "step": 329 + }, + { + "clip_ratio/high_max": 1.5701789834565716e-05, + "clip_ratio/high_mean": 3.925447458641429e-06, + "clip_ratio/low_mean": 3.2665291655575857e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.659073934159096e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15531.0, + "completions/max_terminated_length": 15531.0, + "completions/mean_length": 5581.5625, + "completions/mean_terminated_length": 5581.5625, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.8401889503002167, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031031551770865917, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 271177242.0, + "reward": 0.625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999693036079407, + "sampling/importance_sampling_ratio/min": 0.00020852939633186907, + "sampling/sampling_logp_difference/max": 8.475430488586426, + "sampling/sampling_logp_difference/mean": 0.017869479954242706, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.981169902544934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.981169902544934e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15973.0, + "completions/mean_length": 6442.84375, + "completions/mean_terminated_length": 6364.56689453125, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "entropy": 0.8304163441061974, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002635185606777668, + "learning_rate": 1e-05, + "loss": 0.037, + "num_tokens": 
272021830.0, + "reward": 0.4609375, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 0.0004586660652421415, + "sampling/sampling_logp_difference/max": 7.687188148498535, + "sampling/sampling_logp_difference/mean": 0.01730487309396267, + "step": 331 + }, + { + "clip_ratio/high_max": 2.2348198399413377e-05, + "clip_ratio/high_mean": 6.557516371685779e-06, + "clip_ratio/low_mean": 5.170885208372056e-05, + "clip_ratio/low_min": 4.756469024869148e-06, + "clip_ratio/region_mean": 5.826636891015369e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15868.0, + "completions/mean_length": 6052.265625, + "completions/mean_terminated_length": 5888.27001953125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.9033217504620552, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031849017832428217, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 272818080.0, + "reward": 0.3359375, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999919533729553, + "sampling/importance_sampling_ratio/min": 2.2380504560715053e-07, + "sampling/sampling_logp_difference/max": 15.312490463256836, + "sampling/sampling_logp_difference/mean": 0.019191090017557144, + "step": 332 + }, + { + "clip_ratio/high_max": 3.71780379282427e-06, + "clip_ratio/high_mean": 9.294509482060676e-07, + "clip_ratio/low_mean": 6.115805626905058e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.20875071035698e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 16068.0, + "completions/max_terminated_length": 16068.0, + "completions/mean_length": 6337.5859375, + "completions/mean_terminated_length": 6337.5859375, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 1.0558827072381973, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002086545340716839, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 273648579.0, + "reward": 0.3203125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000107288360596, + "sampling/importance_sampling_ratio/min": 7.982287934282795e-05, + "sampling/sampling_logp_difference/max": 9.435700416564941, + "sampling/sampling_logp_difference/mean": 0.021268527954816818, + "step": 333 + }, + { + "clip_ratio/high_max": 1.228984365297947e-05, + "clip_ratio/high_mean": 3.0724609132448677e-06, + "clip_ratio/low_mean": 3.2620800709537434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.56932616227823e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15556.0, + "completions/mean_length": 6439.78125, + "completions/mean_terminated_length": 6361.48046875, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "entropy": 0.989262692630291, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002226081909611821, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 274493159.0, + "reward": 0.3984375, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
1.0000194311141968, + "sampling/importance_sampling_ratio/min": 0.03169185668230057, + "sampling/sampling_logp_difference/max": 3.451695442199707, + "sampling/sampling_logp_difference/mean": 0.019788069650530815, + "step": 334 + }, + { + "clip_ratio/high_max": 7.10556764715875e-06, + "clip_ratio/high_mean": 1.7763919117896876e-06, + "clip_ratio/low_mean": 3.469589137239382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.647228299996641e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 7641.5234375, + "completions/mean_terminated_length": 7572.68505859375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 1.1427540630102158, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022452943958342075, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 275490762.0, + "reward": 0.203125, + "reward_std": 0.2567248046398163, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0003476575657259673, + "sampling/sampling_logp_difference/max": 7.964292526245117, + "sampling/sampling_logp_difference/mean": 0.022936880588531494, + "step": 335 + }, + { + "clip_ratio/high_max": 3.430955530348001e-06, + "clip_ratio/high_mean": 8.577388825870003e-07, + "clip_ratio/low_mean": 1.611294828762766e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6970687056527822e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15717.0, + "completions/mean_length": 6291.046875, + "completions/mean_terminated_length": 6211.57470703125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, 
+ "entropy": 1.1789169162511826, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.001387307420372963, + "learning_rate": 1e-05, + "loss": -0.0026, + "num_tokens": 276314904.0, + "reward": 0.28125, + "reward_std": 0.1712273508310318, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000487565994263, + "sampling/importance_sampling_ratio/min": 0.012205099686980247, + "sampling/sampling_logp_difference/max": 4.4059014320373535, + "sampling/sampling_logp_difference/mean": 0.020597899332642555, + "step": 336 + }, + { + "clip_ratio/high_max": 1.1513505342009012e-05, + "clip_ratio/high_mean": 2.878376335502253e-06, + "clip_ratio/low_mean": 5.239053416516981e-05, + "clip_ratio/low_min": 5.946967576164752e-06, + "clip_ratio/region_mean": 5.526891072804574e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15962.0, + "completions/mean_length": 7677.5, + "completions/mean_terminated_length": 7019.025390625, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9808845967054367, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018187003443017602, + "learning_rate": 1e-05, + "loss": 0.0705, + "num_tokens": 277320888.0, + "reward": 0.25, + "reward_std": 0.2880108058452606, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999768733978271, + "sampling/importance_sampling_ratio/min": 0.0001234103983733803, + "sampling/sampling_logp_difference/max": 8.999995231628418, + "sampling/sampling_logp_difference/mean": 0.0210642758756876, + "step": 337 + }, + { + "clip_ratio/high_max": 1.7702866443869425e-05, + "clip_ratio/high_mean": 
4.425716610967356e-06, + "clip_ratio/low_mean": 4.517976913120947e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.960548540111631e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15783.0, + "completions/mean_length": 7066.1484375, + "completions/mean_terminated_length": 6992.779296875, + "completions/min_length": 580.0, + "completions/min_terminated_length": 580.0, + "entropy": 1.0734655261039734, + "epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019406796200200915, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 278245739.0, + "reward": 0.3359375, + "reward_std": 0.29249146580696106, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.004089824389666319, + "sampling/sampling_logp_difference/max": 5.499253273010254, + "sampling/sampling_logp_difference/mean": 0.020316962152719498, + "step": 338 + }, + { + "clip_ratio/high_max": 1.661570968281012e-05, + "clip_ratio/high_mean": 5.1870877086912515e-06, + "clip_ratio/low_mean": 1.647002238769346e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.165711032375839e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14474.0, + "completions/max_terminated_length": 14474.0, + "completions/mean_length": 5187.5078125, + "completions/mean_terminated_length": 5187.5078125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.9958596602082253, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023044368717819452, + "learning_rate": 1e-05, + "loss": -0.002, + "num_tokens": 278933796.0, + "reward": 0.453125, + "reward_std": 0.22331714630126953, + "rewards/accuracy_reward/mean": 0.453125, + 
"rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999975562095642, + "sampling/importance_sampling_ratio/min": 1.0969968570861965e-05, + "sampling/sampling_logp_difference/max": 11.42034912109375, + "sampling/sampling_logp_difference/mean": 0.019379254430532455, + "step": 339 + }, + { + "clip_ratio/high_max": 1.5325686035794206e-05, + "clip_ratio/high_mean": 3.8314215089485515e-06, + "clip_ratio/low_mean": 2.3057583121044445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.688900440261932e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15550.0, + "completions/mean_length": 6871.0859375, + "completions/mean_terminated_length": 6484.3818359375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.8953125178813934, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026841885410249233, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 279832175.0, + "reward": 0.4296875, + "reward_std": 0.3595392107963562, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001311302185, + "sampling/importance_sampling_ratio/min": 0.004663798026740551, + "sampling/sampling_logp_difference/max": 5.36792516708374, + "sampling/sampling_logp_difference/mean": 0.019127724692225456, + "step": 340 + }, + { + "clip_ratio/high_max": 1.315804820478661e-05, + "clip_ratio/high_mean": 4.150227596255718e-06, + "clip_ratio/low_mean": 3.6840762675183214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0990990044065256e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14255.0, + "completions/mean_length": 
6459.2109375, + "completions/mean_terminated_length": 6381.06298828125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8647114709019661, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014444541884586215, + "learning_rate": 1e-05, + "loss": 0.0198, + "num_tokens": 280678482.0, + "reward": 0.2734375, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999886751174927, + "sampling/importance_sampling_ratio/min": 0.0019316815305501223, + "sampling/sampling_logp_difference/max": 6.249364376068115, + "sampling/sampling_logp_difference/mean": 0.01974722556769848, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.500776003624196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.500776003624196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16081.0, + "completions/mean_length": 6280.0546875, + "completions/mean_terminated_length": 6037.56005859375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.9132707491517067, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001992191653698683, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 281499753.0, + "reward": 0.375, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999694228172302, + "sampling/importance_sampling_ratio/min": 2.558048436185345e-05, + "sampling/sampling_logp_difference/max": 10.573680877685547, + "sampling/sampling_logp_difference/mean": 
0.01896769367158413, + "step": 342 + }, + { + "clip_ratio/high_max": 1.2855523436883232e-05, + "clip_ratio/high_mean": 3.213880859220808e-06, + "clip_ratio/low_mean": 2.9316923928490723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2530804674024694e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 6220.578125, + "completions/mean_terminated_length": 5892.7255859375, + "completions/min_length": 798.0, + "completions/min_terminated_length": 798.0, + "entropy": 0.8257150128483772, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003750045085325837, + "learning_rate": 1e-05, + "loss": 0.0631, + "num_tokens": 282316795.0, + "reward": 0.515625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999854564666748, + "sampling/importance_sampling_ratio/min": 2.2095075280503806e-07, + "sampling/sampling_logp_difference/max": 15.325325965881348, + "sampling/sampling_logp_difference/mean": 0.017498498782515526, + "step": 343 + }, + { + "clip_ratio/high_max": 9.090150342672132e-06, + "clip_ratio/high_mean": 2.272537585668033e-06, + "clip_ratio/low_mean": 5.6543332675573765e-05, + "clip_ratio/low_min": 4.705262199422577e-06, + "clip_ratio/region_mean": 5.881586980649445e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16134.0, + "completions/mean_length": 6845.09375, + "completions/mean_terminated_length": 6693.68310546875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.9700654074549675, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002124012913554907, + "learning_rate": 1e-05, + "loss": 0.0657, + 
"num_tokens": 283212095.0, + "reward": 0.4296875, + "reward_std": 0.3527093529701233, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914169311523, + "sampling/importance_sampling_ratio/min": 4.450856749826926e-07, + "sampling/sampling_logp_difference/max": 14.624999046325684, + "sampling/sampling_logp_difference/mean": 0.02086886763572693, + "step": 344 + }, + { + "clip_ratio/high_max": 4.2354217839601915e-06, + "clip_ratio/high_mean": 1.0588554459900479e-06, + "clip_ratio/low_mean": 5.4464956633637485e-05, + "clip_ratio/low_min": 7.402143637591507e-06, + "clip_ratio/region_mean": 5.552381219331437e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15416.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 4986.3828125, + "completions/mean_terminated_length": 4986.3828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9103464111685753, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035143878776580095, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 283871808.0, + "reward": 0.4296875, + "reward_std": 0.40715324878692627, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999771118164062, + "sampling/importance_sampling_ratio/min": 0.0028091762214899063, + "sampling/sampling_logp_difference/max": 5.874864101409912, + "sampling/sampling_logp_difference/mean": 0.01833461783826351, + "step": 345 + }, + { + "clip_ratio/high_max": 1.915729558277235e-05, + "clip_ratio/high_mean": 4.789323895693087e-06, + "clip_ratio/low_mean": 2.4886074015739723e-05, + "clip_ratio/low_min": 2.922677595051937e-06, + "clip_ratio/region_mean": 2.9675398081963067e-05, 
+ "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15954.0, + "completions/mean_length": 6467.9921875, + "completions/mean_terminated_length": 6310.595703125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.926672600209713, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014899170491844416, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 284718943.0, + "reward": 0.390625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999134540557861, + "sampling/importance_sampling_ratio/min": 0.00027431987109594047, + "sampling/sampling_logp_difference/max": 8.201215744018555, + "sampling/sampling_logp_difference/mean": 0.01909649185836315, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.792281761936465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.792281761936465e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15890.0, + "completions/mean_length": 6009.3671875, + "completions/mean_terminated_length": 5927.67724609375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 1.0197014585137367, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001638311194255948, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 285507622.0, + "reward": 0.4140625, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998466968536377, + 
"sampling/importance_sampling_ratio/min": 2.144540849258192e-05, + "sampling/sampling_logp_difference/max": 10.75, + "sampling/sampling_logp_difference/mean": 0.0198800191283226, + "step": 347 + }, + { + "clip_ratio/high_max": 1.3140848295734031e-05, + "clip_ratio/high_mean": 3.2852120739335078e-06, + "clip_ratio/low_mean": 5.1451362480747775e-05, + "clip_ratio/low_min": 7.097433353919769e-06, + "clip_ratio/region_mean": 5.473657506627205e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15174.0, + "completions/max_terminated_length": 15174.0, + "completions/mean_length": 6360.421875, + "completions/mean_terminated_length": 6360.421875, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.9253586605191231, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017278637969866395, + "learning_rate": 1e-05, + "loss": 0.0638, + "num_tokens": 286341012.0, + "reward": 0.390625, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998660087585449, + "sampling/importance_sampling_ratio/min": 5.007527215639129e-05, + "sampling/sampling_logp_difference/max": 9.901983261108398, + "sampling/sampling_logp_difference/mean": 0.02024514600634575, + "step": 348 + }, + { + "clip_ratio/high_max": 2.1974663468427025e-05, + "clip_ratio/high_mean": 6.800322353228694e-06, + "clip_ratio/low_mean": 3.598067922894188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.27810022642916e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16158.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 5470.5234375, + "completions/mean_terminated_length": 5470.5234375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.9031187370419502, + 
"epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00106104149017483, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 287065039.0, + "reward": 0.3828125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999252557754517, + "sampling/importance_sampling_ratio/min": 1.6605448536211043e-06, + "sampling/sampling_logp_difference/max": 13.308364868164062, + "sampling/sampling_logp_difference/mean": 0.018382512032985687, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.3466772088577272e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3466772088577272e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15190.0, + "completions/max_terminated_length": 15190.0, + "completions/mean_length": 5533.265625, + "completions/mean_terminated_length": 5533.265625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 1.0052079856395721, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033145309425890446, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 287793249.0, + "reward": 0.484375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.04231228679418564, + "sampling/sampling_logp_difference/max": 3.162677764892578, + "sampling/sampling_logp_difference/mean": 0.020278627052903175, + "step": 350 + }, + { + "clip_ratio/high_max": 3.310516694909893e-05, + "clip_ratio/high_mean": 8.276291737274732e-06, + "clip_ratio/low_mean": 3.8735864336558734e-05, + 
"clip_ratio/low_min": 3.0842873002256965e-06, + "clip_ratio/region_mean": 4.7012156073833467e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 6025.6796875, + "completions/mean_terminated_length": 5604.609375, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "entropy": 0.8798701837658882, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023973146453499794, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 288582232.0, + "reward": 0.453125, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998465776443481, + "sampling/importance_sampling_ratio/min": 5.531576334760757e-06, + "sampling/sampling_logp_difference/max": 12.105037689208984, + "sampling/sampling_logp_difference/mean": 0.01999252662062645, + "step": 351 + }, + { + "clip_ratio/high_max": 1.2754688668792369e-05, + "clip_ratio/high_mean": 4.434933430275123e-06, + "clip_ratio/low_mean": 2.503601820080803e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947095174476999e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14890.0, + "completions/mean_length": 6893.5390625, + "completions/mean_terminated_length": 6818.81103515625, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.8881499394774437, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016761437291279435, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 289483997.0, + "reward": 0.3515625, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, 
+ "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.019294647499918938, + "step": 352 + }, + { + "clip_ratio/high_max": 1.8526947997088428e-05, + "clip_ratio/high_mean": 4.631736999272107e-06, + "clip_ratio/low_mean": 4.962505795447214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.425679569270869e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 6087.828125, + "completions/mean_terminated_length": 6006.755859375, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.8525711894035339, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002270620781928301, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 290282639.0, + "reward": 0.4765625, + "reward_std": 0.3645517826080322, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999156594276428, + "sampling/importance_sampling_ratio/min": 0.0006376233650371432, + "sampling/sampling_logp_difference/max": 7.357762813568115, + "sampling/sampling_logp_difference/mean": 0.01862185075879097, + "step": 353 + }, + { + "clip_ratio/high_max": 1.1926310435228515e-05, + "clip_ratio/high_mean": 2.981577608807129e-06, + "clip_ratio/low_mean": 5.369399366372818e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6675571954656334e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 7951.0, + "completions/mean_terminated_length": 
7678.96728515625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.9653833135962486, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0013396133435890079, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 291320703.0, + "reward": 0.375, + "reward_std": 0.3429914712905884, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 2.4461383873131126e-05, + "sampling/sampling_logp_difference/max": 10.618414878845215, + "sampling/sampling_logp_difference/mean": 0.0205213762819767, + "step": 354 + }, + { + "clip_ratio/high_max": 1.886164773168275e-05, + "clip_ratio/high_mean": 4.715411932920688e-06, + "clip_ratio/low_mean": 4.581529401548323e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0530706175777595e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6017.2578125, + "completions/mean_terminated_length": 5852.70654296875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9492783322930336, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003194117220118642, + "learning_rate": 1e-05, + "loss": 0.0868, + "num_tokens": 292113384.0, + "reward": 0.5703125, + "reward_std": 0.36743485927581787, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999614357948303, + "sampling/importance_sampling_ratio/min": 0.004017275292426348, + "sampling/sampling_logp_difference/max": 5.517151355743408, + "sampling/sampling_logp_difference/mean": 0.02062429115176201, + 
"step": 355 + }, + { + "clip_ratio/high_max": 1.4877897228871007e-05, + "clip_ratio/high_mean": 3.7194743072177516e-06, + "clip_ratio/low_mean": 3.613741432673123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.985688817920163e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15690.0, + "completions/mean_length": 6696.0, + "completions/mean_terminated_length": 6619.71630859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 1.0417355075478554, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001876713940873742, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 292990600.0, + "reward": 0.34375, + "reward_std": 0.28011518716812134, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998572468757629, + "sampling/importance_sampling_ratio/min": 3.398728586034849e-05, + "sampling/sampling_logp_difference/max": 10.28952407836914, + "sampling/sampling_logp_difference/mean": 0.020289337262511253, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8955274046893464e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8955274046893464e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14436.0, + "completions/mean_length": 5184.203125, + "completions/mean_terminated_length": 5096.015625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 1.0320965945720673, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002229714998975396, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 293673106.0, + "reward": 0.375, + "reward_std": 
0.26932865381240845, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000356435775757, + "sampling/importance_sampling_ratio/min": 5.736888851970434e-05, + "sampling/sampling_logp_difference/max": 9.766008377075195, + "sampling/sampling_logp_difference/mean": 0.01969832368195057, + "step": 357 + }, + { + "clip_ratio/high_max": 1.2176971722510643e-05, + "clip_ratio/high_mean": 3.044242930627661e-06, + "clip_ratio/low_mean": 4.728799405029349e-05, + "clip_ratio/low_min": 5.63901312489179e-06, + "clip_ratio/region_mean": 5.033223698092115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15582.0, + "completions/mean_length": 6664.2890625, + "completions/mean_terminated_length": 6510.00830078125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8329441174864769, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001597537542693317, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 294545927.0, + "reward": 0.4609375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000386238098145, + "sampling/importance_sampling_ratio/min": 0.00012341710680630058, + "sampling/sampling_logp_difference/max": 8.999940872192383, + "sampling/sampling_logp_difference/mean": 0.018238451331853867, + "step": 358 + }, + { + "clip_ratio/high_max": 3.2730224575061584e-06, + "clip_ratio/high_mean": 8.182556143765396e-07, + "clip_ratio/low_mean": 5.867890376975993e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.94971597820404e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, 
+ "completions/max_terminated_length": 16322.0, + "completions/mean_length": 7486.4921875, + "completions/mean_terminated_length": 7345.26220703125, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 1.0071435943245888, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0018223393708467484, + "learning_rate": 1e-05, + "loss": 0.1035, + "num_tokens": 295523558.0, + "reward": 0.359375, + "reward_std": 0.36561262607574463, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999614357948303, + "sampling/importance_sampling_ratio/min": 8.459773198410403e-06, + "sampling/sampling_logp_difference/max": 11.680188179016113, + "sampling/sampling_logp_difference/mean": 0.021324433386325836, + "step": 359 + }, + { + "clip_ratio/high_max": 1.9864856540152687e-05, + "clip_ratio/high_mean": 4.966214135038172e-06, + "clip_ratio/low_mean": 4.498222278925823e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.994843698113982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14737.0, + "completions/mean_length": 6103.015625, + "completions/mean_terminated_length": 6022.06298828125, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "entropy": 0.9639975428581238, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002672795206308365, + "learning_rate": 1e-05, + "loss": 0.0559, + "num_tokens": 296323888.0, + "reward": 0.375, + "reward_std": 0.32589420676231384, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998803734779358, + "sampling/importance_sampling_ratio/min": 
0.0057671889662742615, + "sampling/sampling_logp_difference/max": 5.1555705070495605, + "sampling/sampling_logp_difference/mean": 0.019866492599248886, + "step": 360 + }, + { + "clip_ratio/high_max": 1.1948508017667336e-05, + "clip_ratio/high_mean": 2.987127004416834e-06, + "clip_ratio/low_mean": 4.0038267286490736e-05, + "clip_ratio/low_min": 3.0986614092398668e-06, + "clip_ratio/region_mean": 4.302539394984706e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15805.0, + "completions/mean_length": 6524.640625, + "completions/mean_terminated_length": 6368.14306640625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.8653942495584488, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016479750629514456, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 297179234.0, + "reward": 0.46875, + "reward_std": 0.28011518716812134, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.0009119793539866805, + "sampling/sampling_logp_difference/max": 6.9998931884765625, + "sampling/sampling_logp_difference/mean": 0.018908966332674026, + "step": 361 + }, + { + "clip_ratio/high_max": 7.669039405300282e-06, + "clip_ratio/high_mean": 1.9172598513250705e-06, + "clip_ratio/low_mean": 2.1955054876343638e-05, + "clip_ratio/low_min": 3.4466595479898388e-06, + "clip_ratio/region_mean": 2.387231518241606e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16294.0, + "completions/mean_length": 8057.3203125, + "completions/mean_terminated_length": 7857.48046875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 
1.0029005706310272, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018210343550890684, + "learning_rate": 1e-05, + "loss": 0.0309, + "num_tokens": 298230699.0, + "reward": 0.25, + "reward_std": 0.19438526034355164, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999086856842041, + "sampling/importance_sampling_ratio/min": 0.0046700225211679935, + "sampling/sampling_logp_difference/max": 5.366591453552246, + "sampling/sampling_logp_difference/mean": 0.020166225731372833, + "step": 362 + }, + { + "clip_ratio/high_max": 6.953715910640312e-06, + "clip_ratio/high_mean": 1.738428977660078e-06, + "clip_ratio/low_mean": 2.961834002235264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1356769113699556e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6875.3125, + "completions/mean_terminated_length": 6647.1044921875, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8582051023840904, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021944146137684584, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 299131579.0, + "reward": 0.4375, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 5.424213668447919e-06, + "sampling/sampling_logp_difference/max": 12.124637603759766, + "sampling/sampling_logp_difference/mean": 0.018997181206941605, + "step": 363 + }, + { + "clip_ratio/high_max": 1.4359977967615123e-05, + "clip_ratio/high_mean": 5.290952628911327e-06, + 
"clip_ratio/low_mean": 1.991117466104697e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5202126892054366e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16093.0, + "completions/mean_length": 7046.46875, + "completions/mean_terminated_length": 6745.2578125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8899112716317177, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021380677353590727, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 300051471.0, + "reward": 0.390625, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000321865081787, + "sampling/importance_sampling_ratio/min": 0.00043609709246084094, + "sampling/sampling_logp_difference/max": 7.737645626068115, + "sampling/sampling_logp_difference/mean": 0.018849756568670273, + "step": 364 + }, + { + "clip_ratio/high_max": 1.1736750366253546e-05, + "clip_ratio/high_mean": 2.9341875915633864e-06, + "clip_ratio/low_mean": 2.6090394442235265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.902458214748549e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14683.0, + "completions/mean_length": 7227.8203125, + "completions/mean_terminated_length": 7008.072265625, + "completions/min_length": 869.0, + "completions/min_terminated_length": 869.0, + "entropy": 0.9667621031403542, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001994286896660924, + "learning_rate": 1e-05, + "loss": 0.0231, + "num_tokens": 300994584.0, + "reward": 0.4296875, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.4296875, + 
"rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000085830688477, + "sampling/importance_sampling_ratio/min": 0.005131956655532122, + "sampling/sampling_logp_difference/max": 5.272268295288086, + "sampling/sampling_logp_difference/mean": 0.019861025735735893, + "step": 365 + }, + { + "clip_ratio/high_max": 5.608902483800193e-06, + "clip_ratio/high_mean": 1.4022256209500483e-06, + "clip_ratio/low_mean": 1.2587312312462018e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3989537819725228e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16342.0, + "completions/mean_length": 6763.484375, + "completions/mean_terminated_length": 6372.40625, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.9238758087158203, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019569231662899256, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 301878446.0, + "reward": 0.4765625, + "reward_std": 0.2664504647254944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999585151672363, + "sampling/importance_sampling_ratio/min": 6.425123189046644e-08, + "sampling/sampling_logp_difference/max": 16.56046485900879, + "sampling/sampling_logp_difference/mean": 0.019518161192536354, + "step": 366 + }, + { + "clip_ratio/high_max": 4.044129582325695e-06, + "clip_ratio/high_mean": 1.0110323955814238e-06, + "clip_ratio/low_mean": 3.2966671312806284e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3977703822074545e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16018.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 
6098.703125, + "completions/mean_terminated_length": 6098.703125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.7785998061299324, + "epoch": 0.3376264949402024, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024868762120604515, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 302677272.0, + "reward": 0.4921875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999961853027344, + "sampling/importance_sampling_ratio/min": 0.003617732785642147, + "sampling/sampling_logp_difference/max": 5.621907711029053, + "sampling/sampling_logp_difference/mean": 0.017242450267076492, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.291554517341865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.291554517341865e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15935.0, + "completions/mean_length": 6799.1875, + "completions/mean_terminated_length": 6569.15234375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.8998014703392982, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0017842436209321022, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 303565408.0, + "reward": 0.3046875, + "reward_std": 0.17806214094161987, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 0.002333547454327345, + "sampling/sampling_logp_difference/max": 6.060365676879883, + "sampling/sampling_logp_difference/mean": 
0.01987488754093647, + "step": 368 + }, + { + "clip_ratio/high_max": 2.6103274649358355e-05, + "clip_ratio/high_mean": 7.854475143176387e-06, + "clip_ratio/low_mean": 5.6201750339823775e-05, + "clip_ratio/low_min": 6.543817562487675e-06, + "clip_ratio/region_mean": 6.405622525562649e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15338.0, + "completions/mean_length": 5483.4140625, + "completions/mean_terminated_length": 5131.7822265625, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "entropy": 0.8604720532894135, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004101207479834557, + "learning_rate": 1e-05, + "loss": 0.083, + "num_tokens": 304283925.0, + "reward": 0.4375, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999923825263977, + "sampling/importance_sampling_ratio/min": 8.628804062027484e-05, + "sampling/sampling_logp_difference/max": 9.357819557189941, + "sampling/sampling_logp_difference/mean": 0.018733445554971695, + "step": 369 + }, + { + "clip_ratio/high_max": 8.375103107027826e-06, + "clip_ratio/high_mean": 2.0937757767569565e-06, + "clip_ratio/low_mean": 4.883176779912901e-05, + "clip_ratio/low_min": 7.539494390584878e-06, + "clip_ratio/region_mean": 5.092554329166887e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7857.9140625, + "completions/mean_terminated_length": 7722.57958984375, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "entropy": 0.9493537694215775, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025712712667882442, + "learning_rate": 1e-05, + 
"loss": 0.011, + "num_tokens": 305311730.0, + "reward": 0.3125, + "reward_std": 0.3227166533470154, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322295188904, + "sampling/importance_sampling_ratio/min": 0.00010902724170591682, + "sampling/sampling_logp_difference/max": 9.123912811279297, + "sampling/sampling_logp_difference/mean": 0.020730353891849518, + "step": 370 + }, + { + "clip_ratio/high_max": 1.7927761746250326e-05, + "clip_ratio/high_mean": 4.4819404365625815e-06, + "clip_ratio/low_mean": 1.4648778403625329e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.913071884018791e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14578.0, + "completions/mean_length": 6591.28125, + "completions/mean_terminated_length": 6514.17333984375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.8540837243199348, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001778970006853342, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 306172870.0, + "reward": 0.53125, + "reward_std": 0.25855979323387146, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999608397483826, + "sampling/importance_sampling_ratio/min": 0.005589231848716736, + "sampling/sampling_logp_difference/max": 5.18691349029541, + "sampling/sampling_logp_difference/mean": 0.018087508156895638, + "step": 371 + }, + { + "clip_ratio/high_max": 1.5696539094278705e-05, + "clip_ratio/high_mean": 3.924134773569676e-06, + "clip_ratio/low_mean": 4.2228432448609965e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.615256762008357e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 7443.5859375, + "completions/mean_terminated_length": 7301.6748046875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 1.1251945495605469, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024547462817281485, + "learning_rate": 1e-05, + "loss": -0.0017, + "num_tokens": 307145857.0, + "reward": 0.2734375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.0008770838030613959, + "sampling/sampling_logp_difference/max": 7.038908004760742, + "sampling/sampling_logp_difference/mean": 0.021768298000097275, + "step": 372 + }, + { + "clip_ratio/high_max": 7.035515409370419e-06, + "clip_ratio/high_mean": 1.7588788523426047e-06, + "clip_ratio/low_mean": 2.2691801063956518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4450679802612285e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14811.0, + "completions/max_terminated_length": 14811.0, + "completions/mean_length": 6497.890625, + "completions/mean_terminated_length": 6497.890625, + "completions/min_length": 1079.0, + "completions/min_terminated_length": 1079.0, + "entropy": 1.0804385766386986, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003075090004131198, + "learning_rate": 1e-05, + "loss": 0.012, + "num_tokens": 307998003.0, + "reward": 0.3515625, + "reward_std": 0.20753081142902374, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999311566352844, + "sampling/importance_sampling_ratio/min": 0.0032886455301195383, + "sampling/sampling_logp_difference/max": 5.717279434204102, + "sampling/sampling_logp_difference/mean": 0.021208221092820168, + "step": 373 + }, + { + "clip_ratio/high_max": 1.0550694696576102e-05, + "clip_ratio/high_mean": 3.640079512479133e-06, + "clip_ratio/low_mean": 3.440372779550671e-05, + "clip_ratio/low_min": 4.334107870818116e-06, + "clip_ratio/region_mean": 3.804380708061217e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16155.0, + "completions/mean_length": 7146.578125, + "completions/mean_terminated_length": 6692.2783203125, + "completions/min_length": 1089.0, + "completions/min_terminated_length": 1089.0, + "entropy": 0.900071032345295, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023383013904094696, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 308930389.0, + "reward": 0.453125, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000137090682983, + "sampling/importance_sampling_ratio/min": 0.003526465967297554, + "sampling/sampling_logp_difference/max": 5.647459030151367, + "sampling/sampling_logp_difference/mean": 0.019267898052930832, + "step": 374 + }, + { + "clip_ratio/high_max": 2.1745769345216104e-05, + "clip_ratio/high_mean": 6.434876752337004e-06, + "clip_ratio/low_mean": 3.9315604908551904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5750481831419165e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14293.0, + "completions/mean_length": 6189.109375, + "completions/mean_terminated_length": 6108.83447265625, + "completions/min_length": 250.0, + 
"completions/min_terminated_length": 250.0, + "entropy": 0.9284940734505653, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018437084509059787, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 309741419.0, + "reward": 0.4296875, + "reward_std": 0.3050953149795532, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000801086425781, + "sampling/importance_sampling_ratio/min": 4.7444238589378074e-05, + "sampling/sampling_logp_difference/max": 9.955955505371094, + "sampling/sampling_logp_difference/mean": 0.019703445956110954, + "step": 375 + }, + { + "clip_ratio/high_max": 1.630432370802737e-05, + "clip_ratio/high_mean": 4.076080927006842e-06, + "clip_ratio/low_mean": 3.713273554240004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1208816355720046e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15556.0, + "completions/mean_length": 5456.7421875, + "completions/mean_terminated_length": 5194.48828125, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.9236080572009087, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030215675942599773, + "learning_rate": 1e-05, + "loss": 0.0431, + "num_tokens": 310458386.0, + "reward": 0.46875, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 0.00015846964379306883, + "sampling/sampling_logp_difference/max": 8.749947547912598, + "sampling/sampling_logp_difference/mean": 0.01910843700170517, + "step": 376 + }, + { + "clip_ratio/high_max": 
2.3289825548999943e-05, + "clip_ratio/high_mean": 5.822456387249986e-06, + "clip_ratio/low_mean": 3.062871041947801e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.645116612460697e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15118.0, + "completions/mean_length": 6246.25, + "completions/mean_terminated_length": 6085.33349609375, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 1.0128052979707718, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002812379039824009, + "learning_rate": 1e-05, + "loss": 0.0117, + "num_tokens": 311279114.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999204277992249, + "sampling/importance_sampling_ratio/min": 0.0007136549684219062, + "sampling/sampling_logp_difference/max": 7.245110988616943, + "sampling/sampling_logp_difference/mean": 0.02073795720934868, + "step": 377 + }, + { + "clip_ratio/high_max": 1.566006790199026e-05, + "clip_ratio/high_mean": 3.915016975497565e-06, + "clip_ratio/low_mean": 1.4384278813395213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.829929567520594e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15893.0, + "completions/mean_length": 7661.859375, + "completions/mean_terminated_length": 7452.5283203125, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 0.9746306762099266, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018165848450735211, + "learning_rate": 1e-05, + "loss": 0.0255, + "num_tokens": 312280648.0, + "reward": 0.3984375, + "reward_std": 
0.15991678833961487, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999211430549622, + "sampling/importance_sampling_ratio/min": 2.2834767150925472e-05, + "sampling/sampling_logp_difference/max": 10.687226295471191, + "sampling/sampling_logp_difference/mean": 0.02064785361289978, + "step": 378 + }, + { + "clip_ratio/high_max": 6.112351002229843e-06, + "clip_ratio/high_mean": 1.5280877505574608e-06, + "clip_ratio/low_mean": 1.7822256495492184e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9350344246049644e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15283.0, + "completions/mean_length": 6575.921875, + "completions/mean_terminated_length": 6498.69287109375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 1.0576276555657387, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0009623004007153213, + "learning_rate": 1e-05, + "loss": -0.0131, + "num_tokens": 313142142.0, + "reward": 0.296875, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999088048934937, + "sampling/importance_sampling_ratio/min": 0.00010695109085645527, + "sampling/sampling_logp_difference/max": 9.143138885498047, + "sampling/sampling_logp_difference/mean": 0.02001393586397171, + "step": 379 + }, + { + "clip_ratio/high_max": 2.1532956907321932e-05, + "clip_ratio/high_mean": 7.117228278730181e-06, + "clip_ratio/low_mean": 4.647828791348729e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359551732908585e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16045.0, + "completions/mean_length": 7349.8203125, + "completions/mean_terminated_length": 7133.00048828125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.9633770063519478, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016735537210479379, + "learning_rate": 1e-05, + "loss": 0.0769, + "num_tokens": 314106551.0, + "reward": 0.3125, + "reward_std": 0.27670514583587646, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0006543444469571114, + "sampling/sampling_logp_difference/max": 7.331876754760742, + "sampling/sampling_logp_difference/mean": 0.01907072216272354, + "step": 380 + }, + { + "clip_ratio/high_max": 1.9804372868748032e-05, + "clip_ratio/high_mean": 4.951093217187008e-06, + "clip_ratio/low_mean": 2.807680073146912e-05, + "clip_ratio/low_min": 3.144654101561173e-06, + "clip_ratio/region_mean": 3.302789434656006e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16343.0, + "completions/mean_length": 7472.6640625, + "completions/mean_terminated_length": 7402.49609375, + "completions/min_length": 942.0, + "completions/min_terminated_length": 942.0, + "entropy": 1.0234674662351608, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0029567319434136152, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 315081020.0, + "reward": 0.328125, + "reward_std": 0.1841355264186859, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999366998672485, + "sampling/importance_sampling_ratio/min": 
1.3551310985349119e-05, + "sampling/sampling_logp_difference/max": 11.209027290344238, + "sampling/sampling_logp_difference/mean": 0.020730063319206238, + "step": 381 + }, + { + "clip_ratio/high_max": 2.2943146859688568e-05, + "clip_ratio/high_mean": 6.9194542788864055e-06, + "clip_ratio/low_mean": 3.046788117444521e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.738733437330666e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16302.0, + "completions/mean_length": 7663.28125, + "completions/mean_terminated_length": 7234.39306640625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.989475853741169, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002559094922617078, + "learning_rate": 1e-05, + "loss": 0.002, + "num_tokens": 316083520.0, + "reward": 0.2890625, + "reward_std": 0.3227117359638214, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 0.003966364543884993, + "sampling/sampling_logp_difference/max": 5.529905319213867, + "sampling/sampling_logp_difference/mean": 0.02191789261996746, + "step": 382 + }, + { + "clip_ratio/high_max": 1.007244372885907e-05, + "clip_ratio/high_mean": 2.5181109322147677e-06, + "clip_ratio/low_mean": 4.157553627237576e-05, + "clip_ratio/low_min": 7.249949248944176e-06, + "clip_ratio/region_mean": 4.4093647659337876e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15227.0, + "completions/mean_length": 6828.703125, + "completions/mean_terminated_length": 6440.2763671875, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "entropy": 0.9493783265352249, + "epoch": 
0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001576121780090034, + "learning_rate": 1e-05, + "loss": 0.0414, + "num_tokens": 316982154.0, + "reward": 0.4375, + "reward_std": 0.25726157426834106, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999561309814453, + "sampling/importance_sampling_ratio/min": 0.002232425380498171, + "sampling/sampling_logp_difference/max": 6.104666709899902, + "sampling/sampling_logp_difference/mean": 0.020356670022010803, + "step": 383 + }, + { + "clip_ratio/high_max": 4.308265033614589e-06, + "clip_ratio/high_mean": 1.0770662584036472e-06, + "clip_ratio/low_mean": 3.2841844813447096e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.391891118553758e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15194.0, + "completions/mean_length": 6555.2890625, + "completions/mean_terminated_length": 5986.685546875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9516563713550568, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002562758279964328, + "learning_rate": 1e-05, + "loss": -0.0459, + "num_tokens": 317841415.0, + "reward": 0.2734375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999120831489563, + "sampling/importance_sampling_ratio/min": 5.153654274181463e-05, + "sampling/sampling_logp_difference/max": 9.87321949005127, + "sampling/sampling_logp_difference/mean": 0.019885078072547913, + "step": 384 + }, + { + "clip_ratio/high_max": 1.579595573275583e-05, + "clip_ratio/high_mean": 3.948988933188957e-06, + "clip_ratio/low_mean": 
5.6516228141845204e-05, + "clip_ratio/low_min": 1.2799536079910467e-05, + "clip_ratio/region_mean": 6.046521548341843e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 8033.5625, + "completions/mean_terminated_length": 7764.193359375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 1.0841791555285454, + "epoch": 0.35418583256669733, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0015623728977516294, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 318892079.0, + "reward": 0.234375, + "reward_std": 0.26249873638153076, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.0027189957909286022, + "sampling/sampling_logp_difference/max": 5.907492637634277, + "sampling/sampling_logp_difference/mean": 0.022173013538122177, + "step": 385 + }, + { + "clip_ratio/high_max": 1.592646640347084e-05, + "clip_ratio/high_mean": 3.98161660086771e-06, + "clip_ratio/low_mean": 3.5816001627608784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.979761731898179e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6105.0390625, + "completions/mean_terminated_length": 6024.1025390625, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.7882698476314545, + "epoch": 0.35510579576816925, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015339057426899672, + "learning_rate": 1e-05, + "loss": 0.0568, + "num_tokens": 319692740.0, + "reward": 0.5625, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 
0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999640583992004, + "sampling/importance_sampling_ratio/min": 0.005946483928710222, + "sampling/sampling_logp_difference/max": 5.124955177307129, + "sampling/sampling_logp_difference/mean": 0.017854198813438416, + "step": 386 + }, + { + "clip_ratio/high_max": 3.630976607382763e-06, + "clip_ratio/high_mean": 9.077441518456908e-07, + "clip_ratio/low_mean": 2.5168051195123553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6075795346969244e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14242.0, + "completions/max_terminated_length": 14242.0, + "completions/mean_length": 7078.359375, + "completions/mean_terminated_length": 7078.359375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 1.0915816724300385, + "epoch": 0.3560257589696412, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.000674036389682442, + "learning_rate": 1e-05, + "loss": 0.0477, + "num_tokens": 320618618.0, + "reward": 0.375, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999241828918457, + "sampling/importance_sampling_ratio/min": 0.012588412500917912, + "sampling/sampling_logp_difference/max": 4.374978542327881, + "sampling/sampling_logp_difference/mean": 0.021491196006536484, + "step": 387 + }, + { + "clip_ratio/high_max": 2.3060737021296518e-05, + "clip_ratio/high_mean": 8.880587984094745e-06, + "clip_ratio/low_mean": 4.042122702685447e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930181512463605e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15486.0, + "completions/mean_length": 7647.6875, + "completions/mean_terminated_length": 
7065.26708984375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.8284596502780914, + "epoch": 0.35694572217111314, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001767225214280188, + "learning_rate": 1e-05, + "loss": 0.0847, + "num_tokens": 321617138.0, + "reward": 0.4765625, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999255537986755, + "sampling/importance_sampling_ratio/min": 0.0026657104026526213, + "sampling/sampling_logp_difference/max": 5.9272847175598145, + "sampling/sampling_logp_difference/mean": 0.018413839861750603, + "step": 388 + }, + { + "clip_ratio/high_max": 9.76903538685292e-06, + "clip_ratio/high_mean": 3.700462343658728e-06, + "clip_ratio/low_mean": 2.6322781820908858e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0023243880350492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14298.0, + "completions/mean_length": 6616.8984375, + "completions/mean_terminated_length": 6461.865234375, + "completions/min_length": 981.0, + "completions/min_terminated_length": 981.0, + "entropy": 0.9324140176177025, + "epoch": 0.3578656853725851, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007780150044709444, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 322482213.0, + "reward": 0.5078125, + "reward_std": 0.19332444667816162, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999249577522278, + "sampling/importance_sampling_ratio/min": 8.851349093674798e-07, + "sampling/sampling_logp_difference/max": 13.937525749206543, + "sampling/sampling_logp_difference/mean": 
0.019632574170827866, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.183885348154945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.183885348154945e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15911.0, + "completions/mean_length": 6376.375, + "completions/mean_terminated_length": 6297.57470703125, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 1.0122736915946007, + "epoch": 0.35878564857405704, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.00017013182514347136, + "learning_rate": 1e-05, + "loss": 0.0068, + "num_tokens": 323316413.0, + "reward": 0.484375, + "reward_std": 0.1173202246427536, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999897480010986, + "sampling/importance_sampling_ratio/min": 0.001820300007238984, + "sampling/sampling_logp_difference/max": 6.308753967285156, + "sampling/sampling_logp_difference/mean": 0.020268389955163002, + "step": 390 + }, + { + "clip_ratio/high_max": 1.2158910067228135e-05, + "clip_ratio/high_mean": 4.907883408122871e-06, + "clip_ratio/low_mean": 3.3955970252463885e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.886385343321308e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 7434.703125, + "completions/mean_terminated_length": 7364.236328125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 1.056224174797535, + "epoch": 0.35970561177552896, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019504460506141186, + "learning_rate": 1e-05, + "loss": 0.0176, + "num_tokens": 324289663.0, + "reward": 0.3046875, + 
"reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999295473098755, + "sampling/importance_sampling_ratio/min": 0.0005411410820670426, + "sampling/sampling_logp_difference/max": 7.5218305587768555, + "sampling/sampling_logp_difference/mean": 0.021627606824040413, + "step": 391 + }, + { + "clip_ratio/high_max": 2.5075807570829056e-05, + "clip_ratio/high_mean": 7.3508283549017506e-06, + "clip_ratio/low_mean": 3.88432285944873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.619405763151008e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 6783.9140625, + "completions/mean_terminated_length": 6708.32275390625, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "entropy": 0.9994921758770943, + "epoch": 0.36062557497700093, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003350428305566311, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 325174860.0, + "reward": 0.40625, + "reward_std": 0.33797895908355713, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999490976333618, + "sampling/importance_sampling_ratio/min": 0.0019297851249575615, + "sampling/sampling_logp_difference/max": 6.250346660614014, + "sampling/sampling_logp_difference/mean": 0.02060745656490326, + "step": 392 + }, + { + "clip_ratio/high_max": 5.086883902549744e-06, + "clip_ratio/high_mean": 2.125662831531372e-06, + "clip_ratio/low_mean": 3.603865525292349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816431808445486e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15520.0, + "completions/mean_length": 6797.28125, + "completions/mean_terminated_length": 6645.111328125, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 0.9564928039908409, + "epoch": 0.36154553817847285, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030228395480662584, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 326065824.0, + "reward": 0.46875, + "reward_std": 0.27722427248954773, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999678134918213, + "sampling/importance_sampling_ratio/min": 1.927352604980115e-05, + "sampling/sampling_logp_difference/max": 10.856778144836426, + "sampling/sampling_logp_difference/mean": 0.020122073590755463, + "step": 393 + }, + { + "clip_ratio/high_max": 8.678096946823644e-06, + "clip_ratio/high_mean": 2.169524236705911e-06, + "clip_ratio/low_mean": 2.1449313862831332e-05, + "clip_ratio/low_min": 3.5140985801263014e-06, + "clip_ratio/region_mean": 2.361883775847673e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7028.4765625, + "completions/mean_terminated_length": 6954.81103515625, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "entropy": 0.9178477674722672, + "epoch": 0.3624655013799448, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027565474156290293, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 326985805.0, + "reward": 0.40625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 
0.003095855936408043, + "sampling/sampling_logp_difference/max": 5.777690887451172, + "sampling/sampling_logp_difference/mean": 0.019194945693016052, + "step": 394 + }, + { + "clip_ratio/high_max": 1.1162969258293742e-05, + "clip_ratio/high_mean": 2.7907423145734356e-06, + "clip_ratio/low_mean": 4.0257837554236175e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.304857930037542e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15646.0, + "completions/mean_length": 6254.71875, + "completions/mean_terminated_length": 6174.96044921875, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.9090404361486435, + "epoch": 0.36338546458141674, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022540187928825617, + "learning_rate": 1e-05, + "loss": 0.0586, + "num_tokens": 327805417.0, + "reward": 0.4140625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999850392341614, + "sampling/importance_sampling_ratio/min": 0.007726692594587803, + "sampling/sampling_logp_difference/max": 4.86307430267334, + "sampling/sampling_logp_difference/mean": 0.01917862705886364, + "step": 395 + }, + { + "clip_ratio/high_max": 2.4049867988651386e-05, + "clip_ratio/high_mean": 6.012466997162846e-06, + "clip_ratio/low_mean": 2.1124733166288934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7137200504512293e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16051.0, + "completions/mean_length": 7654.546875, + "completions/mean_terminated_length": 7225.22900390625, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9535491093993187, + "epoch": 
0.36430542778288866, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013819639571011066, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 328804303.0, + "reward": 0.5078125, + "reward_std": 0.2301519513130188, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999759793281555, + "sampling/importance_sampling_ratio/min": 0.00017957323871087283, + "sampling/sampling_logp_difference/max": 8.624927520751953, + "sampling/sampling_logp_difference/mean": 0.019935712218284607, + "step": 396 + }, + { + "clip_ratio/high_max": 4.677968718169723e-06, + "clip_ratio/high_mean": 1.1694921795424307e-06, + "clip_ratio/low_mean": 4.5318136926653096e-05, + "clip_ratio/low_min": 1.0762409146991558e-05, + "clip_ratio/region_mean": 4.648762910619553e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 6929.859375, + "completions/mean_terminated_length": 6702.96044921875, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.8612276986241341, + "epoch": 0.36522539098436063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015145445941016078, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 329711437.0, + "reward": 0.4375, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998670220375061, + "sampling/importance_sampling_ratio/min": 6.962344286876032e-06, + "sampling/sampling_logp_difference/max": 11.874994277954102, + "sampling/sampling_logp_difference/mean": 0.01896081678569317, + "step": 397 + }, + { + "clip_ratio/high_max": 1.5800192159076687e-05, + "clip_ratio/high_mean": 5.8905598052660935e-06, 
+ "clip_ratio/low_mean": 1.027900856342967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.616956859606944e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15568.0, + "completions/mean_length": 6751.09375, + "completions/mean_terminated_length": 6675.244140625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 1.008638858795166, + "epoch": 0.36614535418583255, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010175694478675723, + "learning_rate": 1e-05, + "loss": -0.0079, + "num_tokens": 330594657.0, + "reward": 0.40625, + "reward_std": 0.17017142474651337, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999219179153442, + "sampling/importance_sampling_ratio/min": 6.605670205317438e-05, + "sampling/sampling_logp_difference/max": 9.62499713897705, + "sampling/sampling_logp_difference/mean": 0.019827818498015404, + "step": 398 + }, + { + "clip_ratio/high_max": 7.255490572788403e-06, + "clip_ratio/high_mean": 1.8138726431971008e-06, + "clip_ratio/low_mean": 4.20189051055786e-05, + "clip_ratio/low_min": 7.900641321612056e-06, + "clip_ratio/region_mean": 4.383277814667963e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16280.0, + "completions/mean_length": 7907.796875, + "completions/mean_terminated_length": 7563.2353515625, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.8603325337171555, + "epoch": 0.3670653173873045, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014811329310759902, + "learning_rate": 1e-05, + "loss": 0.0714, + "num_tokens": 331626943.0, + "reward": 0.28125, + "reward_std": 0.2161829173564911, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998874068260193, + "sampling/importance_sampling_ratio/min": 3.0665268013763125e-07, + "sampling/sampling_logp_difference/max": 14.997550010681152, + "sampling/sampling_logp_difference/mean": 0.018387217074632645, + "step": 399 + }, + { + "clip_ratio/high_max": 1.2884957641290384e-05, + "clip_ratio/high_mean": 4.083570104285172e-06, + "clip_ratio/low_mean": 1.6143149423442082e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.022671930035358e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 7498.40625, + "completions/mean_terminated_length": 7137.203125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 1.0180751085281372, + "epoch": 0.36798528058877644, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001668943208642304, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 332605987.0, + "reward": 0.3359375, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725222587585, + "sampling/importance_sampling_ratio/min": 3.239733814552892e-08, + "sampling/sampling_logp_difference/max": 17.245189666748047, + "sampling/sampling_logp_difference/mean": 0.020663965493440628, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8121567652306112e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8121567652306112e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6650.4453125, + 
"completions/mean_terminated_length": 6495.94482421875, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 0.9293805658817291, + "epoch": 0.3689052437902484, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0036925526801496744, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333475324.0, + "reward": 0.3828125, + "reward_std": 0.19674427807331085, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999297857284546, + "sampling/importance_sampling_ratio/min": 0.0019147126004099846, + "sampling/sampling_logp_difference/max": 6.258187770843506, + "sampling/sampling_logp_difference/mean": 0.01987956464290619, + "step": 401 + }, + { + "clip_ratio/high_max": 9.03130421647802e-06, + "clip_ratio/high_mean": 2.257826054119505e-06, + "clip_ratio/low_mean": 3.9613908143110166e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.187173419722967e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14410.0, + "completions/mean_length": 6967.6328125, + "completions/mean_terminated_length": 6663.87890625, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "entropy": 0.8103456348180771, + "epoch": 0.36982520699172033, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015530216041952372, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 334389053.0, + "reward": 0.4765625, + "reward_std": 0.29932138323783875, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999937415122986, + "sampling/importance_sampling_ratio/min": 1.2903526112495456e-05, + "sampling/sampling_logp_difference/max": 11.258009910583496, + 
"sampling/sampling_logp_difference/mean": 0.018520750105381012, + "step": 402 + }, + { + "clip_ratio/high_max": 7.21459082342335e-06, + "clip_ratio/high_mean": 1.8036477058558376e-06, + "clip_ratio/low_mean": 2.5680752742118784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7484400334287784e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15788.0, + "completions/mean_length": 6583.15625, + "completions/mean_terminated_length": 6427.587890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 1.0669879838824272, + "epoch": 0.37074517019319225, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023163470905274153, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 335249113.0, + "reward": 0.3671875, + "reward_std": 0.2867175340652466, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999435544013977, + "sampling/importance_sampling_ratio/min": 0.0013276290846988559, + "sampling/sampling_logp_difference/max": 6.62436056137085, + "sampling/sampling_logp_difference/mean": 0.020729750394821167, + "step": 403 + }, + { + "clip_ratio/high_max": 1.915673669827811e-05, + "clip_ratio/high_mean": 4.789184174569527e-06, + "clip_ratio/low_mean": 4.268036605026282e-05, + "clip_ratio/low_min": 6.225874585652491e-06, + "clip_ratio/region_mean": 4.746955005430209e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 7847.734375, + "completions/mean_terminated_length": 7712.23876953125, + "completions/min_length": 1127.0, + "completions/min_terminated_length": 1127.0, + "entropy": 1.0450394004583359, + "epoch": 0.3716651333946642, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.0011931186309084296, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 336270823.0, + "reward": 0.2734375, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000035047531128, + "sampling/importance_sampling_ratio/min": 0.004087730310857296, + "sampling/sampling_logp_difference/max": 5.499765396118164, + "sampling/sampling_logp_difference/mean": 0.02191723883152008, + "step": 404 + }, + { + "clip_ratio/high_max": 7.73082024352334e-06, + "clip_ratio/high_mean": 1.932705060880835e-06, + "clip_ratio/low_mean": 2.2936642153581488e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4869347271305742e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15621.0, + "completions/mean_length": 6286.1953125, + "completions/mean_terminated_length": 6206.68505859375, + "completions/min_length": 918.0, + "completions/min_terminated_length": 918.0, + "entropy": 1.0122173130512238, + "epoch": 0.37258509659613614, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032431832514703274, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 337095136.0, + "reward": 0.4453125, + "reward_std": 0.24275578558444977, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999330639839172, + "sampling/importance_sampling_ratio/min": 2.1024358431986911e-07, + "sampling/sampling_logp_difference/max": 15.374999046325684, + "sampling/sampling_logp_difference/mean": 0.021477293223142624, + "step": 405 + }, + { + "clip_ratio/high_max": 9.451312507735565e-06, + "clip_ratio/high_mean": 2.3628281269338913e-06, + "clip_ratio/low_mean": 1.8447401316734613e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.081022921629483e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15792.0, + "completions/max_terminated_length": 15792.0, + "completions/mean_length": 7430.8125, + "completions/mean_terminated_length": 7430.8125, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 1.1211064383387566, + "epoch": 0.3735050597976081, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012266195844858885, + "learning_rate": 1e-05, + "loss": 0.0132, + "num_tokens": 338069448.0, + "reward": 0.234375, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999444484710693, + "sampling/importance_sampling_ratio/min": 0.0013370488304644823, + "sampling/sampling_logp_difference/max": 6.617290496826172, + "sampling/sampling_logp_difference/mean": 0.02237049862742424, + "step": 406 + }, + { + "clip_ratio/high_max": 1.1666743375826627e-05, + "clip_ratio/high_mean": 2.9166858439566568e-06, + "clip_ratio/low_mean": 3.927663362901512e-05, + "clip_ratio/low_min": 4.591199740389129e-06, + "clip_ratio/region_mean": 4.2193319245598104e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15672.0, + "completions/max_terminated_length": 15672.0, + "completions/mean_length": 6209.578125, + "completions/mean_terminated_length": 6209.578125, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.9696918427944183, + "epoch": 0.37442502299908004, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002120936056599021, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 338883986.0, + "reward": 0.4921875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999944806098938, + "sampling/importance_sampling_ratio/min": 0.000961031299084425, + "sampling/sampling_logp_difference/max": 6.947503566741943, + "sampling/sampling_logp_difference/mean": 0.0204964317381382, + "step": 407 + }, + { + "clip_ratio/high_max": 3.829187789960997e-06, + "clip_ratio/high_mean": 9.572969474902493e-07, + "clip_ratio/low_mean": 4.5606326921188156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.656362375499157e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15322.0, + "completions/max_terminated_length": 15322.0, + "completions/mean_length": 6625.140625, + "completions/mean_terminated_length": 6625.140625, + "completions/min_length": 1063.0, + "completions/min_terminated_length": 1063.0, + "entropy": 1.0780328214168549, + "epoch": 0.37534498620055196, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021016194950789213, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 339753228.0, + "reward": 0.359375, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 0.00479263998568058, + "sampling/sampling_logp_difference/max": 5.340673923492432, + "sampling/sampling_logp_difference/mean": 0.02143041603267193, + "step": 408 + }, + { + "clip_ratio/high_max": 1.7951345853362e-05, + "clip_ratio/high_mean": 4.4878364633405e-06, + "clip_ratio/low_mean": 3.357411151228007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8061947634560056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 7494.2109375, + "completions/mean_terminated_length": 7207.443359375, + "completions/min_length": 62.0, + 
"completions/min_terminated_length": 62.0, + "entropy": 1.0134501904249191, + "epoch": 0.37626494940202393, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017506639705970883, + "learning_rate": 1e-05, + "loss": 0.0361, + "num_tokens": 340731983.0, + "reward": 0.34375, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999791383743286, + "sampling/importance_sampling_ratio/min": 6.919008654904246e-08, + "sampling/sampling_logp_difference/max": 16.486408233642578, + "sampling/sampling_logp_difference/mean": 0.020142192021012306, + "step": 409 + }, + { + "clip_ratio/high_max": 2.0409703665791312e-05, + "clip_ratio/high_mean": 7.713539844189654e-06, + "clip_ratio/low_mean": 3.658559990071808e-05, + "clip_ratio/low_min": 3.80390133614128e-06, + "clip_ratio/region_mean": 4.429913997228141e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15238.0, + "completions/mean_length": 6724.828125, + "completions/mean_terminated_length": 6493.00830078125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.961749866604805, + "epoch": 0.37718491260349585, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014797865878790617, + "learning_rate": 1e-05, + "loss": -0.0195, + "num_tokens": 341613265.0, + "reward": 0.5, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999843835830688, + "sampling/importance_sampling_ratio/min": 1.6481149941682816e-05, + "sampling/sampling_logp_difference/max": 11.013293266296387, + "sampling/sampling_logp_difference/mean": 0.021053435280919075, + "step": 410 + }, + { + 
"clip_ratio/high_max": 8.271860679087695e-06, + "clip_ratio/high_mean": 2.0679651697719237e-06, + "clip_ratio/low_mean": 2.1166565488783817e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.323453065855574e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14961.0, + "completions/mean_length": 6513.5625, + "completions/mean_terminated_length": 6195.1611328125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.8742869198322296, + "epoch": 0.3781048758049678, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018223582301288843, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 342466337.0, + "reward": 0.5, + "reward_std": 0.20593318343162537, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690651893616, + "sampling/importance_sampling_ratio/min": 0.0027132700197398663, + "sampling/sampling_logp_difference/max": 5.909600734710693, + "sampling/sampling_logp_difference/mean": 0.01892159879207611, + "step": 411 + }, + { + "clip_ratio/high_max": 1.867416995082749e-05, + "clip_ratio/high_mean": 4.668542487706873e-06, + "clip_ratio/low_mean": 5.194308118916524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6611622540003737e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15859.0, + "completions/max_terminated_length": 15859.0, + "completions/mean_length": 7088.0390625, + "completions/mean_terminated_length": 7088.0390625, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "entropy": 0.8695354089140892, + "epoch": 0.37902483900643974, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00121080141980201, + "learning_rate": 1e-05, + "loss": 0.0095, + "num_tokens": 343393318.0, + "reward": 0.515625, + "reward_std": 
0.3009189963340759, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.0003100235771853477, + "sampling/sampling_logp_difference/max": 8.078862190246582, + "sampling/sampling_logp_difference/mean": 0.01892455853521824, + "step": 412 + }, + { + "clip_ratio/high_max": 3.6179024164084694e-05, + "clip_ratio/high_mean": 9.044756041021174e-06, + "clip_ratio/low_mean": 3.288474886176118e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1929504845938936e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15178.0, + "completions/mean_length": 6221.6484375, + "completions/mean_terminated_length": 6141.6298828125, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.937163233757019, + "epoch": 0.37994480220791166, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002599990228191018, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 344207225.0, + "reward": 0.390625, + "reward_std": 0.348238468170166, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527335166931, + "sampling/importance_sampling_ratio/min": 3.535756695782766e-05, + "sampling/sampling_logp_difference/max": 10.249998092651367, + "sampling/sampling_logp_difference/mean": 0.019875720143318176, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.69036411534762e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.69036411534762e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + 
"completions/mean_length": 6664.46875, + "completions/mean_terminated_length": 6587.93701171875, + "completions/min_length": 1317.0, + "completions/min_terminated_length": 1317.0, + "entropy": 1.0893034785985947, + "epoch": 0.38086476540938363, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012395181693136692, + "learning_rate": 1e-05, + "loss": 0.0358, + "num_tokens": 345082629.0, + "reward": 0.3984375, + "reward_std": 0.23145011067390442, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253153800964, + "sampling/importance_sampling_ratio/min": 0.0004444181395228952, + "sampling/sampling_logp_difference/max": 7.71874475479126, + "sampling/sampling_logp_difference/mean": 0.022249475121498108, + "step": 414 + }, + { + "clip_ratio/high_max": 3.8116729683679296e-06, + "clip_ratio/high_mean": 9.529182420919824e-07, + "clip_ratio/low_mean": 1.930760379309504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0260522319404117e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16020.0, + "completions/mean_length": 5986.390625, + "completions/mean_terminated_length": 5904.51953125, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9476369470357895, + "epoch": 0.38178472861085555, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011368105188012123, + "learning_rate": 1e-05, + "loss": 0.0414, + "num_tokens": 345869327.0, + "reward": 0.40625, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999204277992249, + "sampling/importance_sampling_ratio/min": 0.0007102401577867568, + "sampling/sampling_logp_difference/max": 
7.249907493591309, + "sampling/sampling_logp_difference/mean": 0.019328134134411812, + "step": 415 + }, + { + "clip_ratio/high_max": 2.638578052938101e-06, + "clip_ratio/high_mean": 6.596445132345252e-07, + "clip_ratio/low_mean": 2.8019193905493012e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8678838418727537e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15247.0, + "completions/mean_length": 7780.8046875, + "completions/mean_terminated_length": 7574.328125, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.9548748508095741, + "epoch": 0.3827046918123275, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016439391765743494, + "learning_rate": 1e-05, + "loss": 0.0134, + "num_tokens": 346885974.0, + "reward": 0.3828125, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999086856842041, + "sampling/importance_sampling_ratio/min": 0.0041214353404939175, + "sampling/sampling_logp_difference/max": 5.491553783416748, + "sampling/sampling_logp_difference/mean": 0.020669173449277878, + "step": 416 + }, + { + "clip_ratio/high_max": 8.280869224108756e-06, + "clip_ratio/high_mean": 2.070217306027189e-06, + "clip_ratio/low_mean": 3.338867099955678e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5458888532957644e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15766.0, + "completions/mean_length": 7118.4921875, + "completions/mean_terminated_length": 6582.470703125, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "entropy": 0.9908356294035912, + "epoch": 0.38362465501379944, + "frac_reward_zero_std": 0.5625, + "grad_norm": 
0.002354196272790432, + "learning_rate": 1e-05, + "loss": 0.037, + "num_tokens": 347818245.0, + "reward": 0.421875, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998934268951416, + "sampling/importance_sampling_ratio/min": 7.691462087677792e-05, + "sampling/sampling_logp_difference/max": 9.472814559936523, + "sampling/sampling_logp_difference/mean": 0.020420750603079796, + "step": 417 + }, + { + "clip_ratio/high_max": 4.261557478457689e-06, + "clip_ratio/high_mean": 1.0653893696144223e-06, + "clip_ratio/low_mean": 3.0260198514042713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1325587883657136e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15213.0, + "completions/mean_length": 7016.0546875, + "completions/mean_terminated_length": 6791.22412109375, + "completions/min_length": 907.0, + "completions/min_terminated_length": 907.0, + "entropy": 0.9372202381491661, + "epoch": 0.3845446182152714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002695834031328559, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 348734852.0, + "reward": 0.484375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999836087226868, + "sampling/importance_sampling_ratio/min": 3.6898933331031003e-07, + "sampling/sampling_logp_difference/max": 14.812498092651367, + "sampling/sampling_logp_difference/mean": 0.01997985690832138, + "step": 418 + }, + { + "clip_ratio/high_max": 1.4203505088516977e-05, + "clip_ratio/high_mean": 4.557706688501639e-06, + "clip_ratio/low_mean": 3.802522951446008e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.258293620296172e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15005.0, + "completions/max_terminated_length": 15005.0, + "completions/mean_length": 6170.859375, + "completions/mean_terminated_length": 6170.859375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.7692223712801933, + "epoch": 0.38546458141674333, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003598283976316452, + "learning_rate": 1e-05, + "loss": 0.0745, + "num_tokens": 349543850.0, + "reward": 0.625, + "reward_std": 0.37875327467918396, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.017690379172563553, + "step": 419 + }, + { + "clip_ratio/high_max": 3.7454306038853247e-06, + "clip_ratio/high_mean": 9.363576509713312e-07, + "clip_ratio/low_mean": 2.0118780639677425e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1055138290648756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14385.0, + "completions/mean_length": 6198.5859375, + "completions/mean_terminated_length": 6118.3857421875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 1.0641538202762604, + "epoch": 0.38638454461821525, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003362868446856737, + "learning_rate": 1e-05, + "loss": 0.0385, + "num_tokens": 350358493.0, + "reward": 0.4375, + "reward_std": 0.2432974874973297, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000051498413086, + "sampling/importance_sampling_ratio/min": 9.425564826415211e-07, + "sampling/sampling_logp_difference/max": 13.874670028686523, + "sampling/sampling_logp_difference/mean": 0.01945672184228897, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.529027955868514e-05, + "clip_ratio/low_min": 1.1817648100986844e-05, + "clip_ratio/region_mean": 4.529027955868514e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7429.953125, + "completions/mean_terminated_length": 6833.01708984375, + "completions/min_length": 1152.0, + "completions/min_terminated_length": 1152.0, + "entropy": 0.7885174229741096, + "epoch": 0.3873045078196872, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020358162000775337, + "learning_rate": 1e-05, + "loss": 0.0665, + "num_tokens": 351327135.0, + "reward": 0.3984375, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483823776245, + "sampling/importance_sampling_ratio/min": 4.07999541494064e-05, + "sampling/sampling_logp_difference/max": 10.106829643249512, + "sampling/sampling_logp_difference/mean": 0.017557526007294655, + "step": 421 + }, + { + "clip_ratio/high_max": 1.2953943951288238e-05, + "clip_ratio/high_mean": 4.294050768294255e-06, + "clip_ratio/low_mean": 2.7448330115475983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174238065639656e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16185.0, + "completions/max_terminated_length": 16185.0, + "completions/mean_length": 7466.75, + "completions/mean_terminated_length": 7466.75, + "completions/min_length": 311.0, + 
"completions/min_terminated_length": 311.0, + "entropy": 0.9798530638217926, + "epoch": 0.38822447102115915, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019255588995292783, + "learning_rate": 1e-05, + "loss": 0.0395, + "num_tokens": 352300247.0, + "reward": 0.265625, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999645352363586, + "sampling/importance_sampling_ratio/min": 0.0010790677042677999, + "sampling/sampling_logp_difference/max": 6.831657886505127, + "sampling/sampling_logp_difference/mean": 0.020764775574207306, + "step": 422 + }, + { + "clip_ratio/high_max": 1.4318582771011279e-05, + "clip_ratio/high_mean": 3.5796456927528197e-06, + "clip_ratio/low_mean": 1.4836090599601448e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8415736349197687e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16182.0, + "completions/mean_length": 6264.40625, + "completions/mean_terminated_length": 6021.5361328125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.8464985340833664, + "epoch": 0.3891444342226311, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016904048388823867, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 353122747.0, + "reward": 0.2890625, + "reward_std": 0.2738093435764313, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 1.569278902024962e-05, + "sampling/sampling_logp_difference/max": 11.062309265136719, + "sampling/sampling_logp_difference/mean": 0.018584076315164566, + "step": 423 + }, + { + 
"clip_ratio/high_max": 1.6524649709026562e-05, + "clip_ratio/high_mean": 5.198334406486538e-06, + "clip_ratio/low_mean": 5.1570618779805955e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.676895318629249e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16051.0, + "completions/max_terminated_length": 16051.0, + "completions/mean_length": 5848.3359375, + "completions/mean_terminated_length": 5848.3359375, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "entropy": 1.0793062299489975, + "epoch": 0.39006439742410304, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015838779509067535, + "learning_rate": 1e-05, + "loss": -0.0144, + "num_tokens": 353888374.0, + "reward": 0.4921875, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999014139175415, + "sampling/importance_sampling_ratio/min": 0.0002261155314045027, + "sampling/sampling_logp_difference/max": 8.394464492797852, + "sampling/sampling_logp_difference/mean": 0.020625369623303413, + "step": 424 + }, + { + "clip_ratio/high_max": 2.2546613308804808e-05, + "clip_ratio/high_mean": 5.636653327201202e-06, + "clip_ratio/low_mean": 4.848485787078971e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.048513922796701e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14583.0, + "completions/mean_length": 5917.984375, + "completions/mean_terminated_length": 5751.857421875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8621423915028572, + "epoch": 0.39098436062557496, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002542395843192935, + "learning_rate": 1e-05, + "loss": 0.053, + "num_tokens": 354665052.0, + "reward": 0.6484375, + "reward_std": 
0.13941732048988342, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999158382415771, + "sampling/importance_sampling_ratio/min": 0.00038012932054698467, + "sampling/sampling_logp_difference/max": 7.874999046325684, + "sampling/sampling_logp_difference/mean": 0.0170799158513546, + "step": 425 + }, + { + "clip_ratio/high_max": 1.1686064681271091e-05, + "clip_ratio/high_mean": 2.9215161703177728e-06, + "clip_ratio/low_mean": 1.6330765674865688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9252282072557136e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 6513.65625, + "completions/mean_terminated_length": 6435.93701171875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0047430396080017, + "epoch": 0.39190432382704693, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0008743361104279757, + "learning_rate": 1e-05, + "loss": 0.0568, + "num_tokens": 355526744.0, + "reward": 0.3125, + "reward_std": 0.16097761690616608, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999683499336243, + "sampling/importance_sampling_ratio/min": 5.006812898500357e-06, + "sampling/sampling_logp_difference/max": 12.204710960388184, + "sampling/sampling_logp_difference/mean": 0.020237455144524574, + "step": 426 + }, + { + "clip_ratio/high_max": 1.7667963220446836e-05, + "clip_ratio/high_mean": 4.416990805111709e-06, + "clip_ratio/low_mean": 2.390649478911655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.832348559422826e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13700.0, + 
"completions/max_terminated_length": 13700.0, + "completions/mean_length": 6363.9375, + "completions/mean_terminated_length": 6363.9375, + "completions/min_length": 1118.0, + "completions/min_terminated_length": 1118.0, + "entropy": 0.910186342895031, + "epoch": 0.39282428702851885, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034290661569684744, + "learning_rate": 1e-05, + "loss": 0.0773, + "num_tokens": 356359920.0, + "reward": 0.4296875, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.0023352939169853926, + "sampling/sampling_logp_difference/max": 6.059617519378662, + "sampling/sampling_logp_difference/mean": 0.019128751009702682, + "step": 427 + }, + { + "clip_ratio/high_max": 1.9295963738841238e-05, + "clip_ratio/high_mean": 4.823990934710309e-06, + "clip_ratio/low_mean": 3.187764491485723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.67016357358807e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14673.0, + "completions/max_terminated_length": 14673.0, + "completions/mean_length": 6206.5859375, + "completions/mean_terminated_length": 6206.5859375, + "completions/min_length": 988.0, + "completions/min_terminated_length": 988.0, + "entropy": 0.8695667088031769, + "epoch": 0.3937442502299908, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022478618193417788, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 357172435.0, + "reward": 0.5390625, + "reward_std": 0.3332657814025879, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000020146369934, + "sampling/importance_sampling_ratio/min": 1.993246769416146e-06, + 
"sampling/sampling_logp_difference/max": 13.12574577331543, + "sampling/sampling_logp_difference/mean": 0.019101407378911972, + "step": 428 + }, + { + "clip_ratio/high_max": 2.577107125034672e-06, + "clip_ratio/high_mean": 6.44276781258668e-07, + "clip_ratio/low_mean": 3.719566507243144e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.783994179684669e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14648.0, + "completions/mean_length": 6773.65625, + "completions/mean_terminated_length": 6697.984375, + "completions/min_length": 1150.0, + "completions/min_terminated_length": 1150.0, + "entropy": 1.0704292133450508, + "epoch": 0.39466421343146274, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030995130073279142, + "learning_rate": 1e-05, + "loss": 0.0409, + "num_tokens": 358060623.0, + "reward": 0.3515625, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589323997498, + "sampling/importance_sampling_ratio/min": 1.8965129129355773e-05, + "sampling/sampling_logp_difference/max": 10.872908592224121, + "sampling/sampling_logp_difference/mean": 0.02080383338034153, + "step": 429 + }, + { + "clip_ratio/high_max": 1.0044732334790751e-05, + "clip_ratio/high_mean": 3.6204799016559264e-06, + "clip_ratio/low_mean": 3.683777390506293e-05, + "clip_ratio/low_min": 4.640285169443814e-06, + "clip_ratio/region_mean": 4.045825380671886e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6753.4609375, + "completions/mean_terminated_length": 6442.79833984375, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "entropy": 0.8907509669661522, + "epoch": 0.39558417663293466, + 
"frac_reward_zero_std": 0.25, + "grad_norm": 0.0025211002212017775, + "learning_rate": 1e-05, + "loss": 0.0812, + "num_tokens": 358942514.0, + "reward": 0.5078125, + "reward_std": 0.33691808581352234, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.001427572569809854, + "sampling/sampling_logp_difference/max": 6.551779747009277, + "sampling/sampling_logp_difference/mean": 0.019076799973845482, + "step": 430 + }, + { + "clip_ratio/high_max": 2.213625748481718e-05, + "clip_ratio/high_mean": 5.534064371204295e-06, + "clip_ratio/low_mean": 4.042425916850334e-05, + "clip_ratio/low_min": 4.858519787376281e-06, + "clip_ratio/region_mean": 4.59583234260208e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16165.0, + "completions/max_terminated_length": 16165.0, + "completions/mean_length": 5878.4921875, + "completions/mean_terminated_length": 5878.4921875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.8234230354428291, + "epoch": 0.39650413983440663, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023358019534498453, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 359716041.0, + "reward": 0.53125, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998354911804199, + "sampling/importance_sampling_ratio/min": 0.0008571944781579077, + "sampling/sampling_logp_difference/max": 7.061845779418945, + "sampling/sampling_logp_difference/mean": 0.018851958215236664, + "step": 431 + }, + { + "clip_ratio/high_max": 7.793237045916612e-06, + "clip_ratio/high_mean": 1.948309261479153e-06, + "clip_ratio/low_mean": 
5.3089813718543155e-05, + "clip_ratio/low_min": 3.7982376852596644e-06, + "clip_ratio/region_mean": 5.503812303686573e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15028.0, + "completions/mean_length": 6296.0078125, + "completions/mean_terminated_length": 6135.88134765625, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.9341304004192352, + "epoch": 0.39742410303587855, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002632992109283805, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 360544066.0, + "reward": 0.390625, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999693036079407, + "sampling/importance_sampling_ratio/min": 0.00015875507961027324, + "sampling/sampling_logp_difference/max": 8.748147964477539, + "sampling/sampling_logp_difference/mean": 0.01882069557905197, + "step": 432 + }, + { + "clip_ratio/high_max": 1.8652748622116633e-05, + "clip_ratio/high_mean": 4.663187155529158e-06, + "clip_ratio/low_mean": 3.725770324081168e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1920890453184256e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15766.0, + "completions/mean_length": 7325.359375, + "completions/mean_terminated_length": 6957.12158203125, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "entropy": 0.7979409247636795, + "epoch": 0.3983440662373505, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002110559493303299, + "learning_rate": 1e-05, + "loss": 0.0474, + "num_tokens": 361502504.0, + "reward": 0.4921875, + "reward_std": 0.21436071395874023, + "rewards/accuracy_reward/mean": 0.4921875, + 
"rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271631240845, + "sampling/importance_sampling_ratio/min": 1.778415753506124e-05, + "sampling/sampling_logp_difference/max": 10.937202453613281, + "sampling/sampling_logp_difference/mean": 0.018452363088726997, + "step": 433 + }, + { + "clip_ratio/high_max": 5.034029982198263e-06, + "clip_ratio/high_mean": 1.2585074955495656e-06, + "clip_ratio/low_mean": 2.1098365436955646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2356872932505212e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 5471.5625, + "completions/mean_terminated_length": 5385.6376953125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "entropy": 0.8691592514514923, + "epoch": 0.39926402943882244, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038794223219156265, + "learning_rate": 1e-05, + "loss": -0.041, + "num_tokens": 362220856.0, + "reward": 0.546875, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 0.0027285523246973753, + "sampling/sampling_logp_difference/max": 5.903984069824219, + "sampling/sampling_logp_difference/mean": 0.01814887300133705, + "step": 434 + }, + { + "clip_ratio/high_max": 1.2709096154139843e-05, + "clip_ratio/high_mean": 3.1772740385349607e-06, + "clip_ratio/low_mean": 4.124845816022571e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.442573271035144e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 
5305.328125, + "completions/mean_terminated_length": 5218.09423828125, + "completions/min_length": 542.0, + "completions/min_terminated_length": 542.0, + "entropy": 0.7804318591952324, + "epoch": 0.40018399264029436, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029884849209338427, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 362921226.0, + "reward": 0.6328125, + "reward_std": 0.3505876660346985, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999871015548706, + "sampling/importance_sampling_ratio/min": 0.0024799995590001345, + "sampling/sampling_logp_difference/max": 5.999496936798096, + "sampling/sampling_logp_difference/mean": 0.017358118668198586, + "step": 435 + }, + { + "clip_ratio/high_max": 4.018904746772023e-06, + "clip_ratio/high_mean": 1.9869055449817097e-06, + "clip_ratio/low_mean": 3.535901299756006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.734591876991544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15577.0, + "completions/max_terminated_length": 15577.0, + "completions/mean_length": 7197.6328125, + "completions/mean_terminated_length": 7197.6328125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9466754496097565, + "epoch": 0.40110395584176634, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023567057214677334, + "learning_rate": 1e-05, + "loss": 0.1036, + "num_tokens": 363863579.0, + "reward": 0.375, + "reward_std": 0.2924865484237671, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 3.132333574740187e-07, + "sampling/sampling_logp_difference/max": 14.976317405700684, + 
"sampling/sampling_logp_difference/mean": 0.020331334322690964, + "step": 436 + }, + { + "clip_ratio/high_max": 3.7869606330787065e-06, + "clip_ratio/high_mean": 9.467401582696766e-07, + "clip_ratio/low_mean": 4.479868130147224e-05, + "clip_ratio/low_min": 5.061343472334556e-06, + "clip_ratio/region_mean": 4.57454214028985e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15503.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6058.7890625, + "completions/mean_terminated_length": 6058.7890625, + "completions/min_length": 732.0, + "completions/min_terminated_length": 732.0, + "entropy": 0.9345398098230362, + "epoch": 0.40202391904323825, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018098369473591447, + "learning_rate": 1e-05, + "loss": 0.1307, + "num_tokens": 364660120.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293684959412, + "sampling/importance_sampling_ratio/min": 0.004112724680453539, + "sampling/sampling_logp_difference/max": 5.493669509887695, + "sampling/sampling_logp_difference/mean": 0.019891154021024704, + "step": 437 + }, + { + "clip_ratio/high_max": 1.2886742979389965e-05, + "clip_ratio/high_mean": 3.221685744847491e-06, + "clip_ratio/low_mean": 4.962291495758109e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.284460121401935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16003.0, + "completions/mean_length": 6929.984375, + "completions/mean_terminated_length": 6625.01611328125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.9930986166000366, + "epoch": 0.4029438822447102, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033301038201898336, + 
"learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 365564662.0, + "reward": 0.3828125, + "reward_std": 0.30457618832588196, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.0009120093891397119, + "sampling/sampling_logp_difference/max": 6.9998602867126465, + "sampling/sampling_logp_difference/mean": 0.02060488425195217, + "step": 438 + }, + { + "clip_ratio/high_max": 1.3284722399475868e-05, + "clip_ratio/high_mean": 3.321180599868967e-06, + "clip_ratio/low_mean": 2.590538883850968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.922656926784839e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14903.0, + "completions/max_terminated_length": 14903.0, + "completions/mean_length": 6197.3671875, + "completions/mean_terminated_length": 6197.3671875, + "completions/min_length": 845.0, + "completions/min_terminated_length": 845.0, + "entropy": 0.9469878897070885, + "epoch": 0.40386384544618215, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003049476072192192, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 366379725.0, + "reward": 0.421875, + "reward_std": 0.3253750801086426, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999247789382935, + "sampling/importance_sampling_ratio/min": 0.0005533178336918354, + "sampling/sampling_logp_difference/max": 7.49957799911499, + "sampling/sampling_logp_difference/mean": 0.019666746258735657, + "step": 439 + }, + { + "clip_ratio/high_max": 1.4212190535545233e-05, + "clip_ratio/high_mean": 3.553047633886308e-06, + "clip_ratio/low_mean": 4.362488289189059e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.7177931264741346e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15647.0, + "completions/mean_length": 6331.6015625, + "completions/mean_terminated_length": 6007.33056640625, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "entropy": 0.9937634319067001, + "epoch": 0.4047838086476541, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001529635745100677, + "learning_rate": 1e-05, + "loss": 0.0863, + "num_tokens": 367207994.0, + "reward": 0.3671875, + "reward_std": 0.2732901871204376, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998211860656738, + "sampling/importance_sampling_ratio/min": 0.0013787593925371766, + "sampling/sampling_logp_difference/max": 6.586571216583252, + "sampling/sampling_logp_difference/mean": 0.02042214572429657, + "step": 440 + }, + { + "clip_ratio/high_max": 1.3438677797239507e-05, + "clip_ratio/high_mean": 4.353689405434125e-06, + "clip_ratio/low_mean": 2.1308957457222277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5662646748969564e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14323.0, + "completions/mean_length": 6679.5, + "completions/mean_terminated_length": 6525.4609375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 1.034226231276989, + "epoch": 0.40570377184912604, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002576075494289398, + "learning_rate": 1e-05, + "loss": 0.0037, + "num_tokens": 368085602.0, + "reward": 0.4921875, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999170899391174, + "sampling/importance_sampling_ratio/min": 0.02749871462583542, + "sampling/sampling_logp_difference/max": 3.593616008758545, + "sampling/sampling_logp_difference/mean": 0.02129797264933586, + "step": 441 + }, + { + "clip_ratio/high_max": 1.2707126188615803e-05, + "clip_ratio/high_mean": 3.1767815471539507e-06, + "clip_ratio/low_mean": 5.362682486520498e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6803606184985256e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14425.0, + "completions/mean_length": 7171.984375, + "completions/mean_terminated_length": 6874.822265625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.994599312543869, + "epoch": 0.40662373505059796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003648000070825219, + "learning_rate": 1e-05, + "loss": 0.0468, + "num_tokens": 369021400.0, + "reward": 0.34375, + "reward_std": 0.3174794614315033, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 2.1446328901220113e-05, + "sampling/sampling_logp_difference/max": 10.749957084655762, + "sampling/sampling_logp_difference/mean": 0.02128203772008419, + "step": 442 + }, + { + "clip_ratio/high_max": 4.010523753095185e-06, + "clip_ratio/high_mean": 1.0026309382737963e-06, + "clip_ratio/low_mean": 5.049121273259516e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.149384355718212e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15812.0, + "completions/mean_length": 7633.953125, + "completions/mean_terminated_length": 7203.62255859375, + "completions/min_length": 746.0, + 
"completions/min_terminated_length": 746.0, + "entropy": 0.9781397357583046, + "epoch": 0.40754369825206993, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002637698082253337, + "learning_rate": 1e-05, + "loss": 0.1255, + "num_tokens": 370022274.0, + "reward": 0.3671875, + "reward_std": 0.3106446862220764, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527931213379, + "sampling/importance_sampling_ratio/min": 0.0006269909208640456, + "sampling/sampling_logp_difference/max": 7.374578475952148, + "sampling/sampling_logp_difference/mean": 0.02037280797958374, + "step": 443 + }, + { + "clip_ratio/high_max": 8.796280781098176e-06, + "clip_ratio/high_mean": 2.199070195274544e-06, + "clip_ratio/low_mean": 2.404907445452409e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6248144422424957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14346.0, + "completions/mean_length": 6966.890625, + "completions/mean_terminated_length": 6892.740234375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0748675763607025, + "epoch": 0.40846366145354185, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002537182765081525, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 370936076.0, + "reward": 0.421875, + "reward_std": 0.24329747259616852, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483823776245, + "sampling/importance_sampling_ratio/min": 0.001600474352017045, + "sampling/sampling_logp_difference/max": 6.437455177307129, + "sampling/sampling_logp_difference/mean": 0.0208933986723423, + "step": 444 + }, + { + "clip_ratio/high_max": 
1.888703832264582e-05, + "clip_ratio/high_mean": 4.721759580661455e-06, + "clip_ratio/low_mean": 3.932560184694012e-05, + "clip_ratio/low_min": 3.3643752885836875e-06, + "clip_ratio/region_mean": 4.404736250762653e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16226.0, + "completions/mean_length": 7487.2890625, + "completions/mean_terminated_length": 7346.07177734375, + "completions/min_length": 792.0, + "completions/min_terminated_length": 792.0, + "entropy": 0.9402988106012344, + "epoch": 0.4093836246550138, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016896538436412811, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 371915793.0, + "reward": 0.3125, + "reward_std": 0.32849061489105225, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999174475669861, + "sampling/importance_sampling_ratio/min": 4.222911684337305e-06, + "sampling/sampling_logp_difference/max": 12.374985694885254, + "sampling/sampling_logp_difference/mean": 0.018897607922554016, + "step": 445 + }, + { + "clip_ratio/high_max": 1.2214306025271071e-05, + "clip_ratio/high_mean": 3.0535765063177678e-06, + "clip_ratio/low_mean": 1.0073189514514524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3126766020832292e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14307.0, + "completions/max_terminated_length": 14307.0, + "completions/mean_length": 5188.9375, + "completions/mean_terminated_length": 5188.9375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.8868530839681625, + "epoch": 0.41030358785648574, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001575644128024578, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 372605969.0, + "reward": 0.5390625, + "reward_std": 
0.1938612163066864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999008774757385, + "sampling/importance_sampling_ratio/min": 0.0020112686324864626, + "sampling/sampling_logp_difference/max": 6.20898962020874, + "sampling/sampling_logp_difference/mean": 0.017719607800245285, + "step": 446 + }, + { + "clip_ratio/high_max": 1.6542175217182375e-05, + "clip_ratio/high_mean": 6.5401112578911125e-06, + "clip_ratio/low_mean": 3.020691053734481e-05, + "clip_ratio/low_min": 4.941101906297263e-06, + "clip_ratio/region_mean": 3.674702134048857e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 7290.9140625, + "completions/mean_terminated_length": 7146.57958984375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 1.06352149695158, + "epoch": 0.41122355105795766, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020332508720457554, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 373557094.0, + "reward": 0.40625, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998309016227722, + "sampling/importance_sampling_ratio/min": 8.97010977496393e-06, + "sampling/sampling_logp_difference/max": 11.621612548828125, + "sampling/sampling_logp_difference/mean": 0.022010326385498047, + "step": 447 + }, + { + "clip_ratio/high_max": 8.10710616860888e-06, + "clip_ratio/high_mean": 2.02677654215222e-06, + "clip_ratio/low_mean": 5.330761632649228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.5334393664452364e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15476.0, + "completions/mean_length": 6881.6640625, + "completions/mean_terminated_length": 6495.39013671875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.9094375595450401, + "epoch": 0.41214351425942963, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019624519627541304, + "learning_rate": 1e-05, + "loss": 0.0492, + "num_tokens": 374459827.0, + "reward": 0.4609375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 3.292101524721147e-08, + "sampling/sampling_logp_difference/max": 17.229154586791992, + "sampling/sampling_logp_difference/mean": 0.019491354003548622, + "step": 448 + }, + { + "clip_ratio/high_max": 2.0297283754189266e-05, + "clip_ratio/high_mean": 5.0743209385473165e-06, + "clip_ratio/low_mean": 3.7426975950438646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.250129745742015e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14792.0, + "completions/mean_length": 6641.203125, + "completions/mean_terminated_length": 6245.154296875, + "completions/min_length": 925.0, + "completions/min_terminated_length": 925.0, + "entropy": 0.7556380406022072, + "epoch": 0.41306347746090155, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0010716031538322568, + "learning_rate": 1e-05, + "loss": 0.1355, + "num_tokens": 375331749.0, + "reward": 0.625, + "reward_std": 0.34876543283462524, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000039339065552, + "sampling/importance_sampling_ratio/min": 
0.00010258897236781195, + "sampling/sampling_logp_difference/max": 9.18478012084961, + "sampling/sampling_logp_difference/mean": 0.017056716606020927, + "step": 449 + }, + { + "clip_ratio/high_max": 2.1341018509701826e-05, + "clip_ratio/high_mean": 5.335254627425456e-06, + "clip_ratio/low_mean": 4.72563451694441e-05, + "clip_ratio/low_min": 6.4834025579330046e-06, + "clip_ratio/region_mean": 5.259159979686956e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15757.0, + "completions/max_terminated_length": 15757.0, + "completions/mean_length": 6514.875, + "completions/mean_terminated_length": 6514.875, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.9535354822874069, + "epoch": 0.4139834406623735, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025929149705916643, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 376183309.0, + "reward": 0.421875, + "reward_std": 0.28277361392974854, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998977184295654, + "sampling/importance_sampling_ratio/min": 0.002191081177443266, + "sampling/sampling_logp_difference/max": 6.1233601570129395, + "sampling/sampling_logp_difference/mean": 0.019740387797355652, + "step": 450 + }, + { + "clip_ratio/high_max": 1.2529956165963085e-05, + "clip_ratio/high_mean": 4.370210831439181e-06, + "clip_ratio/low_mean": 6.38160736343707e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.075181819487625e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15798.0, + "completions/mean_length": 6045.640625, + "completions/mean_terminated_length": 5964.236328125, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 1.0733412355184555, + "epoch": 
0.41490340386384544, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023648168426007032, + "learning_rate": 1e-05, + "loss": 0.005, + "num_tokens": 376978175.0, + "reward": 0.421875, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999704360961914, + "sampling/importance_sampling_ratio/min": 0.0001392154226778075, + "sampling/sampling_logp_difference/max": 8.879487991333008, + "sampling/sampling_logp_difference/mean": 0.020569145679473877, + "step": 451 + }, + { + "clip_ratio/high_max": 4.286840976419626e-06, + "clip_ratio/high_mean": 1.0717102441049065e-06, + "clip_ratio/low_mean": 2.4207001501963532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5278711859755276e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7767.7578125, + "completions/mean_terminated_length": 7489.814453125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 1.0381295159459114, + "epoch": 0.41582336706531736, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0015338027151301503, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 377994592.0, + "reward": 0.4140625, + "reward_std": 0.14230038225650787, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999648332595825, + "sampling/importance_sampling_ratio/min": 8.825274733226252e-08, + "sampling/sampling_logp_difference/max": 16.243061065673828, + "sampling/sampling_logp_difference/mean": 0.02027149498462677, + "step": 452 + }, + { + "clip_ratio/high_max": 7.272515631484566e-06, + "clip_ratio/high_mean": 1.8181289078711416e-06, + 
"clip_ratio/low_mean": 2.767900923572597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949713825728395e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15264.0, + "completions/max_terminated_length": 15264.0, + "completions/mean_length": 7002.21875, + "completions/mean_terminated_length": 7002.21875, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 1.0032588243484497, + "epoch": 0.41674333026678934, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002184878336265683, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 378909468.0, + "reward": 0.4453125, + "reward_std": 0.17859894037246704, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321103096008, + "sampling/importance_sampling_ratio/min": 1.3040186786383856e-05, + "sampling/sampling_logp_difference/max": 11.247474670410156, + "sampling/sampling_logp_difference/mean": 0.02025642991065979, + "step": 453 + }, + { + "clip_ratio/high_max": 4.38227471022401e-06, + "clip_ratio/high_mean": 1.0955686775560025e-06, + "clip_ratio/low_mean": 2.8486808901106997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9582377578663e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 8433.3984375, + "completions/mean_terminated_length": 8042.384765625, + "completions/min_length": 1429.0, + "completions/min_terminated_length": 1429.0, + "entropy": 0.9339399412274361, + "epoch": 0.41766329346826125, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0015065330080688, + "learning_rate": 1e-05, + "loss": 0.0026, + "num_tokens": 380009687.0, + "reward": 0.3359375, + "reward_std": 0.17358636856079102, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 
0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999430179595947, + "sampling/importance_sampling_ratio/min": 0.0004234187363181263, + "sampling/sampling_logp_difference/max": 7.767148971557617, + "sampling/sampling_logp_difference/mean": 0.020081156864762306, + "step": 454 + }, + { + "clip_ratio/high_max": 1.8815874227584573e-05, + "clip_ratio/high_mean": 4.703968556896143e-06, + "clip_ratio/low_mean": 2.8154490735232685e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.285845917844199e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15186.0, + "completions/max_terminated_length": 15186.0, + "completions/mean_length": 7050.3203125, + "completions/mean_terminated_length": 7050.3203125, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "entropy": 0.9537717178463936, + "epoch": 0.41858325666973323, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0013606940628960729, + "learning_rate": 1e-05, + "loss": 0.0125, + "num_tokens": 380930480.0, + "reward": 0.578125, + "reward_std": 0.28407180309295654, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999956488609314, + "sampling/importance_sampling_ratio/min": 0.00011017238284694031, + "sampling/sampling_logp_difference/max": 9.11346435546875, + "sampling/sampling_logp_difference/mean": 0.020253805443644524, + "step": 455 + }, + { + "clip_ratio/high_max": 4.247366632625926e-06, + "clip_ratio/high_mean": 1.0618416581564816e-06, + "clip_ratio/low_mean": 2.397758157712815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5039423462658306e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15698.0, + "completions/max_terminated_length": 15698.0, + "completions/mean_length": 6561.1640625, + 
"completions/mean_terminated_length": 6561.1640625, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.9863667339086533, + "epoch": 0.41950321987120515, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017187768826261163, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 381790981.0, + "reward": 0.4375, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998313188552856, + "sampling/importance_sampling_ratio/min": 0.010767512023448944, + "sampling/sampling_logp_difference/max": 4.531221866607666, + "sampling/sampling_logp_difference/mean": 0.02073034644126892, + "step": 456 + }, + { + "clip_ratio/high_max": 2.9292289127624827e-05, + "clip_ratio/high_mean": 8.657401849632151e-06, + "clip_ratio/low_mean": 4.3774077425950964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2431478707148926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15705.0, + "completions/mean_length": 7120.1875, + "completions/mean_terminated_length": 6973.14306640625, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.9760185852646828, + "epoch": 0.4204231830726771, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016811270033940673, + "learning_rate": 1e-05, + "loss": 0.0804, + "num_tokens": 382722173.0, + "reward": 0.421875, + "reward_std": 0.27670514583587646, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999004602432251, + "sampling/importance_sampling_ratio/min": 0.0008047398878261447, + "sampling/sampling_logp_difference/max": 7.124991416931152, + 
"sampling/sampling_logp_difference/mean": 0.02018534392118454, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.610178137274488e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.610178137274488e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 7057.1015625, + "completions/mean_terminated_length": 6833.25634765625, + "completions/min_length": 922.0, + "completions/min_terminated_length": 922.0, + "entropy": 0.948130652308464, + "epoch": 0.42134314627414904, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015492907259613276, + "learning_rate": 1e-05, + "loss": 0.0319, + "num_tokens": 383650426.0, + "reward": 0.421875, + "reward_std": 0.21040895581245422, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999640583992004, + "sampling/importance_sampling_ratio/min": 0.003965416923165321, + "sampling/sampling_logp_difference/max": 5.530144214630127, + "sampling/sampling_logp_difference/mean": 0.02065262943506241, + "step": 458 + }, + { + "clip_ratio/high_max": 8.952108146331739e-06, + "clip_ratio/high_mean": 2.2380270365829347e-06, + "clip_ratio/low_mean": 2.777617066840321e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.001419747761247e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7001.7578125, + "completions/mean_terminated_length": 6852.83349609375, + "completions/min_length": 1065.0, + "completions/min_terminated_length": 1065.0, + "entropy": 0.9631693065166473, + "epoch": 0.42226310947562096, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0013419219758361578, + "learning_rate": 1e-05, + "loss": 0.0705, + 
"num_tokens": 384565995.0, + "reward": 0.390625, + "reward_std": 0.18701860308647156, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999476671218872, + "sampling/importance_sampling_ratio/min": 0.0006672164890915155, + "sampling/sampling_logp_difference/max": 7.312396049499512, + "sampling/sampling_logp_difference/mean": 0.01975739374756813, + "step": 459 + }, + { + "clip_ratio/high_max": 1.215636098095274e-05, + "clip_ratio/high_mean": 3.039090245238185e-06, + "clip_ratio/low_mean": 4.157363855483709e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4612729197979206e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15727.0, + "completions/mean_length": 7282.875, + "completions/mean_terminated_length": 6912.91015625, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9037974923849106, + "epoch": 0.42318307267709293, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021072588860988617, + "learning_rate": 1e-05, + "loss": 0.0866, + "num_tokens": 385516659.0, + "reward": 0.359375, + "reward_std": 0.3277292251586914, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0013449778780341148, + "sampling/sampling_logp_difference/max": 6.611377716064453, + "sampling/sampling_logp_difference/mean": 0.018494941294193268, + "step": 460 + }, + { + "clip_ratio/high_max": 1.669851098995423e-05, + "clip_ratio/high_mean": 4.174627747488557e-06, + "clip_ratio/low_mean": 2.594786496956658e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0122492944428814e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14783.0, + "completions/mean_length": 7063.6953125, + "completions/mean_terminated_length": 6840.00830078125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9738125056028366, + "epoch": 0.42410303587856485, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020963819697499275, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 386440556.0, + "reward": 0.4765625, + "reward_std": 0.28930407762527466, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999623894691467, + "sampling/importance_sampling_ratio/min": 7.853446390981844e-07, + "sampling/sampling_logp_difference/max": 14.057143211364746, + "sampling/sampling_logp_difference/mean": 0.0198366716504097, + "step": 461 + }, + { + "clip_ratio/high_max": 3.949322490370832e-06, + "clip_ratio/high_mean": 9.87330622592708e-07, + "clip_ratio/low_mean": 1.8185473095400084e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9172803717992792e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15651.0, + "completions/mean_length": 7672.7421875, + "completions/mean_terminated_length": 7262.0244140625, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 1.0194172486662865, + "epoch": 0.4250229990800368, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014175203396007419, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 387450843.0, + "reward": 0.4609375, + "reward_std": 0.24541424214839935, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999865889549255, + 
"sampling/importance_sampling_ratio/min": 0.004501644056290388, + "sampling/sampling_logp_difference/max": 5.403312683105469, + "sampling/sampling_logp_difference/mean": 0.02058412693440914, + "step": 462 + }, + { + "clip_ratio/high_max": 2.1894326664551045e-05, + "clip_ratio/high_mean": 6.6363724613438535e-06, + "clip_ratio/low_mean": 8.431412652498693e-05, + "clip_ratio/low_min": 3.288245125077083e-05, + "clip_ratio/region_mean": 9.095049927054788e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 6846.8828125, + "completions/mean_terminated_length": 6459.19482421875, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.886472262442112, + "epoch": 0.42594296228150874, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002659202553331852, + "learning_rate": 1e-05, + "loss": 0.1199, + "num_tokens": 388344660.0, + "reward": 0.34375, + "reward_std": 0.40267258882522583, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000640153884888, + "sampling/importance_sampling_ratio/min": 0.00015848172188270837, + "sampling/sampling_logp_difference/max": 8.749871253967285, + "sampling/sampling_logp_difference/mean": 0.018909990787506104, + "step": 463 + }, + { + "clip_ratio/high_max": 1.3184767340135295e-05, + "clip_ratio/high_mean": 3.2961918350338237e-06, + "clip_ratio/low_mean": 4.2340758909631404e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.563695051729155e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 6271.84375, + "completions/mean_terminated_length": 6029.15234375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + 
"entropy": 0.9538674280047417, + "epoch": 0.42686292548298066, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002775643253698945, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 389167344.0, + "reward": 0.484375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000534057617188, + "sampling/importance_sampling_ratio/min": 0.0022844907362014055, + "sampling/sampling_logp_difference/max": 6.0816121101379395, + "sampling/sampling_logp_difference/mean": 0.020731300115585327, + "step": 464 + }, + { + "clip_ratio/high_max": 5.017863713874249e-06, + "clip_ratio/high_mean": 1.2544659284685622e-06, + "clip_ratio/low_mean": 3.720694280673342e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.846140884888882e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6312.9765625, + "completions/mean_terminated_length": 6233.67724609375, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "entropy": 0.937890075147152, + "epoch": 0.42778288868445263, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001834206865169108, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 389993613.0, + "reward": 0.484375, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000054836273193, + "sampling/importance_sampling_ratio/min": 0.0004770693776663393, + "sampling/sampling_logp_difference/max": 7.647848606109619, + "sampling/sampling_logp_difference/mean": 0.020461473613977432, + "step": 465 + }, + { + "clip_ratio/high_max": 1.484874360357935e-05, + "clip_ratio/high_mean": 
3.7121859008948377e-06, + "clip_ratio/low_mean": 3.374425170932227e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7456437212313176e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15638.0, + "completions/mean_length": 5643.125, + "completions/mean_terminated_length": 5385.34423828125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.9210820645093918, + "epoch": 0.42870285188592455, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015243689995259047, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 390735629.0, + "reward": 0.4765625, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998995661735535, + "sampling/importance_sampling_ratio/min": 1.4338597509322426e-07, + "sampling/sampling_logp_difference/max": 15.757725715637207, + "sampling/sampling_logp_difference/mean": 0.01841399073600769, + "step": 466 + }, + { + "clip_ratio/high_max": 5.748976491304347e-06, + "clip_ratio/high_mean": 1.4372441228260868e-06, + "clip_ratio/low_mean": 3.702218441503646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.845942796942836e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16084.0, + "completions/mean_length": 8048.40625, + "completions/mean_terminated_length": 7848.3525390625, + "completions/min_length": 1236.0, + "completions/min_terminated_length": 1236.0, + "entropy": 1.048905499279499, + "epoch": 0.4296228150873965, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026433062739670277, + "learning_rate": 1e-05, + "loss": 0.0548, + "num_tokens": 391786761.0, + "reward": 0.265625, + "reward_std": 0.22962789237499237, + "rewards/accuracy_reward/mean": 
0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000020980834961, + "sampling/importance_sampling_ratio/min": 0.0006000763387419283, + "sampling/sampling_logp_difference/max": 7.418453693389893, + "sampling/sampling_logp_difference/mean": 0.021647389978170395, + "step": 467 + }, + { + "clip_ratio/high_max": 2.0228523908372154e-05, + "clip_ratio/high_mean": 5.057130977093038e-06, + "clip_ratio/low_mean": 5.334191632755392e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.839904770255089e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16197.0, + "completions/mean_length": 7073.078125, + "completions/mean_terminated_length": 6772.7255859375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 1.0020805671811104, + "epoch": 0.43054277828886844, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019431376131251454, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 392709699.0, + "reward": 0.4140625, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999632239341736, + "sampling/importance_sampling_ratio/min": 0.0003546403022482991, + "sampling/sampling_logp_difference/max": 7.944406509399414, + "sampling/sampling_logp_difference/mean": 0.020886382088065147, + "step": 468 + }, + { + "clip_ratio/high_max": 8.001388550837873e-06, + "clip_ratio/high_mean": 2.0003471377094684e-06, + "clip_ratio/low_mean": 5.976677766739158e-05, + "clip_ratio/low_min": 1.2241466720297467e-05, + "clip_ratio/region_mean": 6.176712395244977e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16249.0, + 
"completions/mean_length": 7128.5390625, + "completions/mean_terminated_length": 6981.62744140625, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.9986839666962624, + "epoch": 0.43146274149034036, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002014609519392252, + "learning_rate": 1e-05, + "loss": 0.0787, + "num_tokens": 393643864.0, + "reward": 0.265625, + "reward_std": 0.3411741852760315, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000815391540527, + "sampling/importance_sampling_ratio/min": 0.0030073157977312803, + "sampling/sampling_logp_difference/max": 5.806707382202148, + "sampling/sampling_logp_difference/mean": 0.020323367789387703, + "step": 469 + }, + { + "clip_ratio/high_max": 1.0874447525566211e-05, + "clip_ratio/high_mean": 2.7186118813915527e-06, + "clip_ratio/low_mean": 3.265329507939896e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.537190696079051e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14445.0, + "completions/mean_length": 5538.625, + "completions/mean_terminated_length": 5366.4765625, + "completions/min_length": 1149.0, + "completions/min_terminated_length": 1149.0, + "entropy": 1.0297009721398354, + "epoch": 0.43238270469181234, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019020825857296586, + "learning_rate": 1e-05, + "loss": 0.0277, + "num_tokens": 394371184.0, + "reward": 0.3515625, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.00010906249372055754, + "sampling/sampling_logp_difference/max": 
9.123589515686035, + "sampling/sampling_logp_difference/mean": 0.01992623880505562, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.427005844969244e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.427005844969244e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16072.0, + "completions/mean_length": 7305.7109375, + "completions/mean_terminated_length": 7087.83251953125, + "completions/min_length": 1106.0, + "completions/min_terminated_length": 1106.0, + "entropy": 0.9444865211844444, + "epoch": 0.43330266789328425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037416366394609213, + "learning_rate": 1e-05, + "loss": 0.07, + "num_tokens": 395325427.0, + "reward": 0.375, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999666213989258, + "sampling/importance_sampling_ratio/min": 1.3054028613623814e-06, + "sampling/sampling_logp_difference/max": 13.548998832702637, + "sampling/sampling_logp_difference/mean": 0.02093587815761566, + "step": 471 + }, + { + "clip_ratio/high_max": 1.0206378192378907e-05, + "clip_ratio/high_mean": 2.5515945480947266e-06, + "clip_ratio/low_mean": 2.926629849753226e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.181789293194015e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16266.0, + "completions/mean_length": 6020.71875, + "completions/mean_terminated_length": 5686.4189453125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9555193856358528, + "epoch": 0.43422263109475623, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003123396774753928, + "learning_rate": 1e-05, + "loss": 
0.0906, + "num_tokens": 396118047.0, + "reward": 0.375, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 9.029568900587037e-05, + "sampling/sampling_logp_difference/max": 9.312420845031738, + "sampling/sampling_logp_difference/mean": 0.019349105656147003, + "step": 472 + }, + { + "clip_ratio/high_max": 7.391638519038679e-06, + "clip_ratio/high_mean": 1.8479096297596698e-06, + "clip_ratio/low_mean": 4.082024281615304e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.266815255959955e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16177.0, + "completions/mean_length": 6789.203125, + "completions/mean_terminated_length": 6149.55029296875, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.8103364855051041, + "epoch": 0.43514259429622815, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017067189328372478, + "learning_rate": 1e-05, + "loss": 0.0618, + "num_tokens": 397008497.0, + "reward": 0.421875, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000635385513306, + "sampling/importance_sampling_ratio/min": 1.8778002868202748e-06, + "sampling/sampling_logp_difference/max": 13.185409545898438, + "sampling/sampling_logp_difference/mean": 0.01813405565917492, + "step": 473 + }, + { + "clip_ratio/high_max": 3.4544700611149892e-06, + "clip_ratio/high_mean": 1.6775043150119018e-06, + "clip_ratio/low_mean": 3.894365818268852e-05, + "clip_ratio/low_min": 3.4544700611149892e-06, + "clip_ratio/region_mean": 4.0621162042953074e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 8000.53125, + "completions/mean_terminated_length": 7934.51953125, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "entropy": 1.0201406553387642, + "epoch": 0.43606255749770007, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001533582923002541, + "learning_rate": 1e-05, + "loss": 0.0826, + "num_tokens": 398052373.0, + "reward": 0.328125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000503063201904, + "sampling/importance_sampling_ratio/min": 3.783419288083678e-06, + "sampling/sampling_logp_difference/max": 12.484882354736328, + "sampling/sampling_logp_difference/mean": 0.02113974839448929, + "step": 474 + }, + { + "clip_ratio/high_max": 5.666878223564709e-06, + "clip_ratio/high_mean": 1.4167195558911772e-06, + "clip_ratio/low_mean": 1.8879915842262562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0296635739214253e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15121.0, + "completions/max_terminated_length": 15121.0, + "completions/mean_length": 6122.6875, + "completions/mean_terminated_length": 6122.6875, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 1.0430640205740929, + "epoch": 0.43698252069917204, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025845973286777735, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 398855205.0, + "reward": 0.5, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999109506607056, + 
"sampling/importance_sampling_ratio/min": 3.3893353247549385e-05, + "sampling/sampling_logp_difference/max": 10.292291641235352, + "sampling/sampling_logp_difference/mean": 0.020821597427129745, + "step": 475 + }, + { + "clip_ratio/high_max": 6.862502914373181e-06, + "clip_ratio/high_mean": 1.7156257285932952e-06, + "clip_ratio/low_mean": 3.732125173883105e-05, + "clip_ratio/low_min": 3.870448381348979e-06, + "clip_ratio/region_mean": 3.9036877069520415e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16308.0, + "completions/mean_length": 6895.4453125, + "completions/mean_terminated_length": 6820.732421875, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 1.097649298608303, + "epoch": 0.43790248390064396, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00137829699087888, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 399758166.0, + "reward": 0.2890625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999674558639526, + "sampling/importance_sampling_ratio/min": 8.400417755183298e-06, + "sampling/sampling_logp_difference/max": 11.68722915649414, + "sampling/sampling_logp_difference/mean": 0.02135382406413555, + "step": 476 + }, + { + "clip_ratio/high_max": 8.859707577357767e-06, + "clip_ratio/high_mean": 2.2149268943394418e-06, + "clip_ratio/low_mean": 3.0371424600161845e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.258635138081445e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14854.0, + "completions/mean_length": 5552.265625, + "completions/mean_terminated_length": 5380.33349609375, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 
1018.0, + "entropy": 0.9384580478072166, + "epoch": 0.43882244710211593, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002469305880367756, + "learning_rate": 1e-05, + "loss": 0.0868, + "num_tokens": 400488560.0, + "reward": 0.515625, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998993277549744, + "sampling/importance_sampling_ratio/min": 1.934680221893359e-05, + "sampling/sampling_logp_difference/max": 10.852983474731445, + "sampling/sampling_logp_difference/mean": 0.019046220928430557, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.751295116671827e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.751295116671827e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6492.8125, + "completions/mean_terminated_length": 6335.81005859375, + "completions/min_length": 1238.0, + "completions/min_terminated_length": 1238.0, + "entropy": 0.9447641968727112, + "epoch": 0.43974241030358785, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019261077977716923, + "learning_rate": 1e-05, + "loss": 0.0684, + "num_tokens": 401339544.0, + "reward": 0.359375, + "reward_std": 0.27221953868865967, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999949932098389, + "sampling/importance_sampling_ratio/min": 0.016565052792429924, + "sampling/sampling_logp_difference/max": 4.100460052490234, + "sampling/sampling_logp_difference/mean": 0.018938450142741203, + "step": 478 + }, + { + "clip_ratio/high_max": 1.0270573739035171e-05, + "clip_ratio/high_mean": 2.567643434758793e-06, + 
"clip_ratio/low_mean": 3.2130441354638606e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4698084505180304e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 6688.5546875, + "completions/mean_terminated_length": 6211.72900390625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9593756124377251, + "epoch": 0.4406623735050598, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027252996806055307, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 402213983.0, + "reward": 0.4375, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 0.09333998709917068, + "sampling/sampling_logp_difference/max": 2.371506690979004, + "sampling/sampling_logp_difference/mean": 0.020656028762459755, + "step": 479 + }, + { + "clip_ratio/high_max": 4.220976734359283e-06, + "clip_ratio/high_mean": 1.0552441835898208e-06, + "clip_ratio/low_mean": 2.7019574872610974e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.807481928357447e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15739.0, + "completions/mean_length": 6957.8828125, + "completions/mean_terminated_length": 6808.26220703125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.9458145052194595, + "epoch": 0.44158233670653174, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021163993515074253, + "learning_rate": 1e-05, + "loss": -0.0054, + "num_tokens": 403124296.0, + "reward": 0.3125, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.3125, + 
"rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000032186508179, + "sampling/importance_sampling_ratio/min": 5.414607926468307e-07, + "sampling/sampling_logp_difference/max": 14.428995132446289, + "sampling/sampling_logp_difference/mean": 0.019670519977808, + "step": 480 + }, + { + "clip_ratio/high_max": 1.4141203109829803e-05, + "clip_ratio/high_mean": 4.24627120310106e-06, + "clip_ratio/low_mean": 3.319961399483873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7445884800035856e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 7141.8359375, + "completions/mean_terminated_length": 6843.701171875, + "completions/min_length": 1005.0, + "completions/min_terminated_length": 1005.0, + "entropy": 0.9727424532175064, + "epoch": 0.44250229990800366, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0024569793604314327, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 404056571.0, + "reward": 0.421875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999956488609314, + "sampling/importance_sampling_ratio/min": 8.950789379014168e-06, + "sampling/sampling_logp_difference/max": 11.62376880645752, + "sampling/sampling_logp_difference/mean": 0.020752113312482834, + "step": 481 + }, + { + "clip_ratio/high_max": 1.5587193956889678e-05, + "clip_ratio/high_mean": 4.596514145305264e-06, + "clip_ratio/low_mean": 6.96504166626255e-05, + "clip_ratio/low_min": 7.279775445567793e-06, + "clip_ratio/region_mean": 7.424693194479914e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + 
"completions/mean_length": 7685.046875, + "completions/mean_terminated_length": 7476.2724609375, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9029846489429474, + "epoch": 0.44342226310947563, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019990119617432356, + "learning_rate": 1e-05, + "loss": 0.1109, + "num_tokens": 405058705.0, + "reward": 0.421875, + "reward_std": 0.38375797867774963, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999930262565613, + "sampling/importance_sampling_ratio/min": 0.002107172505930066, + "sampling/sampling_logp_difference/max": 6.162408351898193, + "sampling/sampling_logp_difference/mean": 0.01937328279018402, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7506703443359584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7506703443359584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7004.21875, + "completions/mean_terminated_length": 6779.1044921875, + "completions/min_length": 936.0, + "completions/min_terminated_length": 936.0, + "entropy": 0.9121566936373711, + "epoch": 0.44434222631094755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029584914445877075, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 405974789.0, + "reward": 0.5234375, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000674724578857, + "sampling/importance_sampling_ratio/min": 0.000792751437984407, + "sampling/sampling_logp_difference/max": 7.140000820159912, + 
"sampling/sampling_logp_difference/mean": 0.019368886947631836, + "step": 483 + }, + { + "clip_ratio/high_max": 1.2470530009522918e-05, + "clip_ratio/high_mean": 3.1176325023807294e-06, + "clip_ratio/low_mean": 3.606646794196422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918410050118837e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 6294.90625, + "completions/mean_terminated_length": 6215.46435546875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.899978794157505, + "epoch": 0.4452621895124195, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001394490827806294, + "learning_rate": 1e-05, + "loss": 0.0376, + "num_tokens": 406798417.0, + "reward": 0.4296875, + "reward_std": 0.2577856183052063, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000015497207642, + "sampling/importance_sampling_ratio/min": 0.0007101757801137865, + "sampling/sampling_logp_difference/max": 7.249998092651367, + "sampling/sampling_logp_difference/mean": 0.018764980137348175, + "step": 484 + }, + { + "clip_ratio/high_max": 1.568959305586759e-05, + "clip_ratio/high_mean": 3.9223982639668975e-06, + "clip_ratio/low_mean": 3.593084011299652e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.985323814958974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15685.0, + "completions/mean_length": 6940.046875, + "completions/mean_terminated_length": 6790.14306640625, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9006319642066956, + "epoch": 0.44618215271389144, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002361331367865205, + 
"learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 407703351.0, + "reward": 0.4453125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999904036521912, + "sampling/importance_sampling_ratio/min": 4.8537625843891874e-05, + "sampling/sampling_logp_difference/max": 9.933171272277832, + "sampling/sampling_logp_difference/mean": 0.019578561186790466, + "step": 485 + }, + { + "clip_ratio/high_max": 5.896504717384232e-06, + "clip_ratio/high_mean": 1.474126179346058e-06, + "clip_ratio/low_mean": 4.614499187027832e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7619118163311214e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 6362.484375, + "completions/mean_terminated_length": 6283.57470703125, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 0.9299133494496346, + "epoch": 0.44710211591536336, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027458088006824255, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 408537765.0, + "reward": 0.4296875, + "reward_std": 0.3595392107963562, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999920129776001, + "sampling/importance_sampling_ratio/min": 0.0007113060564734042, + "sampling/sampling_logp_difference/max": 7.24840784072876, + "sampling/sampling_logp_difference/mean": 0.019821636378765106, + "step": 486 + }, + { + "clip_ratio/high_max": 2.0891785879939562e-05, + "clip_ratio/high_mean": 7.879635973040422e-06, + "clip_ratio/low_mean": 2.6475246386326035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.435488224567962e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15924.0, + "completions/max_terminated_length": 15924.0, + "completions/mean_length": 5226.765625, + "completions/mean_terminated_length": 5226.765625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 1.0277203470468521, + "epoch": 0.44802207911683534, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024366467259824276, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 409223903.0, + "reward": 0.546875, + "reward_std": 0.3006146252155304, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044584274292, + "sampling/importance_sampling_ratio/min": 0.01590813137590885, + "sampling/sampling_logp_difference/max": 4.14092493057251, + "sampling/sampling_logp_difference/mean": 0.019991066306829453, + "step": 487 + }, + { + "clip_ratio/high_max": 9.688145382824587e-06, + "clip_ratio/high_mean": 2.4220363457061467e-06, + "clip_ratio/low_mean": 1.920005956890236e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.162209625566902e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12344.0, + "completions/max_terminated_length": 12344.0, + "completions/mean_length": 5051.0, + "completions/mean_terminated_length": 5051.0, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.8572651967406273, + "epoch": 0.44894204231830726, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027017516549676657, + "learning_rate": 1e-05, + "loss": -0.003, + "num_tokens": 409895199.0, + "reward": 0.6015625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.999940037727356, + "sampling/importance_sampling_ratio/min": 5.7065666624112055e-05, + "sampling/sampling_logp_difference/max": 9.771307945251465, + "sampling/sampling_logp_difference/mean": 0.01831716299057007, + "step": 488 + }, + { + "clip_ratio/high_max": 1.5306721707020188e-05, + "clip_ratio/high_mean": 3.826680426755047e-06, + "clip_ratio/low_mean": 3.0764163398089295e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4590844165904855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13702.0, + "completions/mean_length": 6231.9765625, + "completions/mean_terminated_length": 6070.83349609375, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 0.9115571528673172, + "epoch": 0.44986200551977923, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021461176220327616, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 410711300.0, + "reward": 0.4765625, + "reward_std": 0.2672119140625, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000066757202148, + "sampling/importance_sampling_ratio/min": 0.00019801831513177603, + "sampling/sampling_logp_difference/max": 8.527151107788086, + "sampling/sampling_logp_difference/mean": 0.019596103578805923, + "step": 489 + }, + { + "clip_ratio/high_max": 2.7797910661320202e-05, + "clip_ratio/high_mean": 9.322406867795507e-06, + "clip_ratio/low_mean": 6.275825364809862e-05, + "clip_ratio/low_min": 3.0194694318197435e-06, + "clip_ratio/region_mean": 7.208066119801515e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16059.0, + "completions/mean_length": 6766.4765625, + "completions/mean_terminated_length": 6375.52001953125, + "completions/min_length": 764.0, + 
"completions/min_terminated_length": 764.0, + "entropy": 0.8712737187743187, + "epoch": 0.45078196872125115, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019740054849535227, + "learning_rate": 1e-05, + "loss": 0.091, + "num_tokens": 411597969.0, + "reward": 0.4609375, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 7.488903065677732e-05, + "sampling/sampling_logp_difference/max": 9.499503135681152, + "sampling/sampling_logp_difference/mean": 0.018991166725754738, + "step": 490 + }, + { + "clip_ratio/high_max": 4.992810318071861e-06, + "clip_ratio/high_mean": 1.2482025795179652e-06, + "clip_ratio/low_mean": 1.100720277236178e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.2255405295036326e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14776.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 6619.1171875, + "completions/mean_terminated_length": 6619.1171875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.1462209969758987, + "epoch": 0.45170193192272307, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.001665184274315834, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 412464384.0, + "reward": 0.3046875, + "reward_std": 0.17806214094161987, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999956488609314, + "sampling/importance_sampling_ratio/min": 0.009808298200368881, + "sampling/sampling_logp_difference/max": 4.624526500701904, + "sampling/sampling_logp_difference/mean": 0.02124062180519104, + "step": 491 + }, + { + "clip_ratio/high_max": 
1.5520400665991474e-05, + "clip_ratio/high_mean": 3.8801001664978685e-06, + "clip_ratio/low_mean": 2.0763711063409573e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.464381134359428e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 7035.25, + "completions/mean_terminated_length": 6886.857421875, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.9810440614819527, + "epoch": 0.45262189512419504, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015779118984937668, + "learning_rate": 1e-05, + "loss": 0.0582, + "num_tokens": 413383792.0, + "reward": 0.4453125, + "reward_std": 0.21436068415641785, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999344944953918, + "sampling/importance_sampling_ratio/min": 0.01566622592508793, + "sampling/sampling_logp_difference/max": 4.156248092651367, + "sampling/sampling_logp_difference/mean": 0.021432677283883095, + "step": 492 + }, + { + "clip_ratio/high_max": 4.644250566343544e-06, + "clip_ratio/high_mean": 1.161062641585886e-06, + "clip_ratio/low_mean": 3.4143843777201255e-05, + "clip_ratio/low_min": 3.276024699516711e-06, + "clip_ratio/region_mean": 3.530490653247398e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15869.0, + "completions/mean_length": 6945.9375, + "completions/mean_terminated_length": 6796.12744140625, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.7932121306657791, + "epoch": 0.45354185832566696, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013254050863906741, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 414290000.0, + "reward": 0.4921875, + "reward_std": 
0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999009370803833, + "sampling/importance_sampling_ratio/min": 7.031726272543892e-05, + "sampling/sampling_logp_difference/max": 9.562493324279785, + "sampling/sampling_logp_difference/mean": 0.018196485936641693, + "step": 493 + }, + { + "clip_ratio/high_max": 1.8977402305608848e-05, + "clip_ratio/high_mean": 4.744350576402212e-06, + "clip_ratio/low_mean": 3.744401988114987e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218837011649157e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14930.0, + "completions/mean_length": 7526.4375, + "completions/mean_terminated_length": 7313.8564453125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.9790460616350174, + "epoch": 0.45446182152713893, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001866620616056025, + "learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 415272280.0, + "reward": 0.4140625, + "reward_std": 0.2517249584197998, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998944997787476, + "sampling/importance_sampling_ratio/min": 0.00020347593817859888, + "sampling/sampling_logp_difference/max": 8.49996280670166, + "sampling/sampling_logp_difference/mean": 0.020433884114027023, + "step": 494 + }, + { + "clip_ratio/high_max": 7.432954589603469e-06, + "clip_ratio/high_mean": 3.44574186783575e-06, + "clip_ratio/low_mean": 4.426451175731927e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7710253397781344e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15964.0, + "completions/mean_length": 6108.8671875, + "completions/mean_terminated_length": 5862.26416015625, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "entropy": 0.8818904608488083, + "epoch": 0.45538178472861085, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002431972650811076, + "learning_rate": 1e-05, + "loss": 0.0175, + "num_tokens": 416072591.0, + "reward": 0.59375, + "reward_std": 0.26720699667930603, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999450445175171, + "sampling/importance_sampling_ratio/min": 0.001706472015939653, + "sampling/sampling_logp_difference/max": 6.373327255249023, + "sampling/sampling_logp_difference/mean": 0.01932165026664734, + "step": 495 + }, + { + "clip_ratio/high_max": 9.704292551759863e-06, + "clip_ratio/high_mean": 2.426073137939966e-06, + "clip_ratio/low_mean": 1.47394894156605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7165562553600466e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15239.0, + "completions/max_terminated_length": 15239.0, + "completions/mean_length": 6841.59375, + "completions/mean_terminated_length": 6841.59375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 1.1732418313622475, + "epoch": 0.4563017479300828, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002203838201239705, + "learning_rate": 1e-05, + "loss": 0.0308, + "num_tokens": 416966187.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998947381973267, + "sampling/importance_sampling_ratio/min": 0.0004944052780047059, + 
"sampling/sampling_logp_difference/max": 7.612154960632324, + "sampling/sampling_logp_difference/mean": 0.02160799130797386, + "step": 496 + }, + { + "clip_ratio/high_max": 2.328647701688169e-05, + "clip_ratio/high_mean": 5.821619254220423e-06, + "clip_ratio/low_mean": 5.462882245410583e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0450441651482834e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13836.0, + "completions/max_terminated_length": 13836.0, + "completions/mean_length": 5898.7421875, + "completions/mean_terminated_length": 5898.7421875, + "completions/min_length": 675.0, + "completions/min_terminated_length": 675.0, + "entropy": 0.9141146093606949, + "epoch": 0.45722171113155474, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028326623141765594, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 417740586.0, + "reward": 0.4453125, + "reward_std": 0.32984596490859985, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998853206634521, + "sampling/importance_sampling_ratio/min": 1.0281119102728553e-06, + "sampling/sampling_logp_difference/max": 13.787786483764648, + "sampling/sampling_logp_difference/mean": 0.01856965571641922, + "step": 497 + }, + { + "clip_ratio/high_max": 2.667783610377228e-05, + "clip_ratio/high_mean": 6.66945902594307e-06, + "clip_ratio/low_mean": 4.455613873233233e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.122559878145694e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 6416.140625, + "completions/mean_terminated_length": 6176.912109375, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.8854602724313736, + "epoch": 0.45814167433302666, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.001950124162249267, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 418579788.0, + "reward": 0.5078125, + "reward_std": 0.25012245774269104, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998780488967896, + "sampling/importance_sampling_ratio/min": 2.6581541533232667e-05, + "sampling/sampling_logp_difference/max": 10.535293579101562, + "sampling/sampling_logp_difference/mean": 0.01931869424879551, + "step": 498 + }, + { + "clip_ratio/high_max": 3.6452713629842037e-06, + "clip_ratio/high_mean": 9.113178407460509e-07, + "clip_ratio/low_mean": 3.819847256636422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910979035026685e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15548.0, + "completions/mean_length": 7738.2578125, + "completions/mean_terminated_length": 7313.05712890625, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.9239770472049713, + "epoch": 0.45906163753449863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016899642068892717, + "learning_rate": 1e-05, + "loss": 0.0844, + "num_tokens": 419589021.0, + "reward": 0.375, + "reward_std": 0.20069600641727448, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000193119049072, + "sampling/importance_sampling_ratio/min": 0.00016869053069967777, + "sampling/sampling_logp_difference/max": 8.687444686889648, + "sampling/sampling_logp_difference/mean": 0.01966589316725731, + "step": 499 + }, + { + "clip_ratio/high_max": 1.0700351140258135e-05, + "clip_ratio/high_mean": 2.675087785064534e-06, + "clip_ratio/low_mean": 3.456382330568886e-05, + 
"clip_ratio/low_min": 4.663483196054585e-06, + "clip_ratio/region_mean": 3.723891120444023e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 7594.921875, + "completions/mean_terminated_length": 7383.984375, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "entropy": 0.9970445707440376, + "epoch": 0.45998160073597055, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026633136440068483, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 420579459.0, + "reward": 0.40625, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 0.000756366120185703, + "sampling/sampling_logp_difference/max": 7.186985015869141, + "sampling/sampling_logp_difference/mean": 0.020969431847333908, + "step": 500 + }, + { + "clip_ratio/high_max": 2.166650710933027e-05, + "clip_ratio/high_mean": 6.6261792426303145e-06, + "clip_ratio/low_mean": 5.730952580051962e-05, + "clip_ratio/low_min": 4.826068561669672e-06, + "clip_ratio/region_mean": 6.393570629370515e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14856.0, + "completions/max_terminated_length": 14856.0, + "completions/mean_length": 5897.2890625, + "completions/mean_terminated_length": 5897.2890625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.9427390918135643, + "epoch": 0.4609015639374425, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015446916222572327, + "learning_rate": 1e-05, + "loss": -0.0487, + "num_tokens": 421354536.0, + "reward": 0.40625, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000121593475342, + "sampling/importance_sampling_ratio/min": 0.00037080893525853753, + "sampling/sampling_logp_difference/max": 7.8998236656188965, + "sampling/sampling_logp_difference/mean": 0.019464563578367233, + "step": 501 + }, + { + "clip_ratio/high_max": 3.1168960958893877e-06, + "clip_ratio/high_mean": 7.792240239723469e-07, + "clip_ratio/low_mean": 1.842527422013518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9204498244107526e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16365.0, + "completions/mean_length": 7197.1875, + "completions/mean_terminated_length": 6900.83837890625, + "completions/min_length": 1181.0, + "completions/min_terminated_length": 1181.0, + "entropy": 0.9357216581702232, + "epoch": 0.46182152713891444, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019460292533040047, + "learning_rate": 1e-05, + "loss": 0.0249, + "num_tokens": 422296632.0, + "reward": 0.4921875, + "reward_std": 0.20934812724590302, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0004937088815495372, + "sampling/sampling_logp_difference/max": 7.613564491271973, + "sampling/sampling_logp_difference/mean": 0.0199101734906435, + "step": 502 + }, + { + "clip_ratio/high_max": 3.01917771139415e-06, + "clip_ratio/high_mean": 7.547944278485375e-07, + "clip_ratio/low_mean": 2.4536840555811068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5291634983659605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16092.0, + "completions/mean_length": 6675.8515625, + "completions/mean_terminated_length": 
6599.40966796875, + "completions/min_length": 1369.0, + "completions/min_terminated_length": 1369.0, + "entropy": 0.8980752006173134, + "epoch": 0.46274149034038636, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017490689642727375, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 423170085.0, + "reward": 0.484375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966025352478, + "sampling/importance_sampling_ratio/min": 4.0153237932827324e-05, + "sampling/sampling_logp_difference/max": 10.122807502746582, + "sampling/sampling_logp_difference/mean": 0.01868046447634697, + "step": 503 + }, + { + "clip_ratio/high_max": 1.4156895304040518e-05, + "clip_ratio/high_mean": 4.290660626793397e-06, + "clip_ratio/low_mean": 4.468955739866942e-05, + "clip_ratio/low_min": 3.951194685214432e-06, + "clip_ratio/region_mean": 4.898021779808914e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 7394.5546875, + "completions/mean_terminated_length": 6874.50390625, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.891602098941803, + "epoch": 0.46366145354185834, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026261890307068825, + "learning_rate": 1e-05, + "loss": 0.0981, + "num_tokens": 424134916.0, + "reward": 0.484375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0019415394635871053, + "sampling/sampling_logp_difference/max": 6.244274139404297, + 
"sampling/sampling_logp_difference/mean": 0.018863018602132797, + "step": 504 + }, + { + "clip_ratio/high_max": 4.867222287430195e-06, + "clip_ratio/high_mean": 1.2168055718575488e-06, + "clip_ratio/low_mean": 2.737805482411204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8594860509656428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 5508.3359375, + "completions/mean_terminated_length": 5422.70068359375, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.9608336761593819, + "epoch": 0.46458141674333026, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030600661411881447, + "learning_rate": 1e-05, + "loss": 0.0369, + "num_tokens": 424860847.0, + "reward": 0.5625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999776482582092, + "sampling/importance_sampling_ratio/min": 4.006533345091157e-05, + "sampling/sampling_logp_difference/max": 10.124999046325684, + "sampling/sampling_logp_difference/mean": 0.018935665488243103, + "step": 505 + }, + { + "clip_ratio/high_max": 1.3109260635246756e-05, + "clip_ratio/high_mean": 3.277315158811689e-06, + "clip_ratio/low_mean": 3.854507008327346e-05, + "clip_ratio/low_min": 2.992077043018071e-06, + "clip_ratio/region_mean": 4.182238512839831e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16077.0, + "completions/mean_length": 7779.4765625, + "completions/mean_terminated_length": 7572.96826171875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 1.0322985425591469, + "epoch": 0.46550137994480223, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.002075409982353449, + "learning_rate": 1e-05, + "loss": 0.0939, + "num_tokens": 425877532.0, + "reward": 0.421875, + "reward_std": 0.3337898254394531, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999785423278809, + "sampling/importance_sampling_ratio/min": 0.025282513350248337, + "sampling/sampling_logp_difference/max": 3.677642345428467, + "sampling/sampling_logp_difference/mean": 0.020769601687788963, + "step": 506 + }, + { + "clip_ratio/high_max": 1.4176180684444262e-05, + "clip_ratio/high_mean": 4.564619985103491e-06, + "clip_ratio/low_mean": 2.2551324207142898e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7115944419620064e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15292.0, + "completions/mean_length": 6004.21875, + "completions/mean_terminated_length": 5755.1044921875, + "completions/min_length": 992.0, + "completions/min_terminated_length": 992.0, + "entropy": 0.9162944257259369, + "epoch": 0.46642134314627415, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0039940495043993, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 426666008.0, + "reward": 0.6328125, + "reward_std": 0.31140607595443726, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 5.144598981132731e-05, + "sampling/sampling_logp_difference/max": 9.874978065490723, + "sampling/sampling_logp_difference/mean": 0.01873711869120598, + "step": 507 + }, + { + "clip_ratio/high_max": 3.6937442473572446e-06, + "clip_ratio/high_mean": 9.234360618393112e-07, + "clip_ratio/low_mean": 3.4857803484555916e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.578123954639523e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14337.0, + "completions/mean_length": 6619.6015625, + "completions/mean_terminated_length": 6542.71630859375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 1.1118961870670319, + "epoch": 0.46734130634774607, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002274538855999708, + "learning_rate": 1e-05, + "loss": 0.0259, + "num_tokens": 427535397.0, + "reward": 0.3125, + "reward_std": 0.2177756428718567, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000343322753906, + "sampling/importance_sampling_ratio/min": 2.4061378098849673e-06, + "sampling/sampling_logp_difference/max": 12.937487602233887, + "sampling/sampling_logp_difference/mean": 0.0214434452354908, + "step": 508 + }, + { + "clip_ratio/high_max": 7.764184829284204e-06, + "clip_ratio/high_mean": 1.941046207321051e-06, + "clip_ratio/low_mean": 2.4530202267669665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6471248474990716e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15923.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6469.9765625, + "completions/mean_terminated_length": 6469.9765625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.8812271729111671, + "epoch": 0.46826126954921804, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020800349302589893, + "learning_rate": 1e-05, + "loss": 0.0592, + "num_tokens": 428379026.0, + "reward": 0.546875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999022483825684, + "sampling/importance_sampling_ratio/min": 9.611256973585114e-05, + "sampling/sampling_logp_difference/max": 9.249990463256836, + "sampling/sampling_logp_difference/mean": 0.01902790367603302, + "step": 509 + }, + { + "clip_ratio/high_max": 3.3670939956209622e-06, + "clip_ratio/high_mean": 8.417734989052406e-07, + "clip_ratio/low_mean": 3.1169882220183354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.201165577593201e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16189.0, + "completions/mean_length": 7417.2421875, + "completions/mean_terminated_length": 7346.6376953125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 1.0124703496694565, + "epoch": 0.46918123275068996, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013554802862927318, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 429347777.0, + "reward": 0.359375, + "reward_std": 0.24039676785469055, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 1.086339216271881e-05, + "sampling/sampling_logp_difference/max": 11.4301118850708, + "sampling/sampling_logp_difference/mean": 0.02034895122051239, + "step": 510 + }, + { + "clip_ratio/high_max": 2.4966960609162925e-05, + "clip_ratio/high_mean": 6.241740152290731e-06, + "clip_ratio/low_mean": 2.400768698862521e-05, + "clip_ratio/low_min": 7.9038825333555e-06, + "clip_ratio/region_mean": 3.0249426572481752e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 6539.7578125, + "completions/mean_terminated_length": 6383.50048828125, + 
"completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.9707148522138596, + "epoch": 0.47010119595216193, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016008630627766252, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 430203402.0, + "reward": 0.5078125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999600648880005, + "sampling/importance_sampling_ratio/min": 1.7258255269325673e-08, + "sampling/sampling_logp_difference/max": 17.874975204467773, + "sampling/sampling_logp_difference/mean": 0.01951115019619465, + "step": 511 + }, + { + "clip_ratio/high_max": 7.0406667873612605e-06, + "clip_ratio/high_mean": 1.7601666968403151e-06, + "clip_ratio/low_mean": 2.4132358305450907e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5892525002291222e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6722.53125, + "completions/mean_terminated_length": 6329.78857421875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9293247908353806, + "epoch": 0.47102115915363385, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002203655894845724, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 431082350.0, + "reward": 0.46875, + "reward_std": 0.18543371558189392, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999996542930603, + "sampling/importance_sampling_ratio/min": 0.002989979926496744, + "sampling/sampling_logp_difference/max": 5.812488555908203, + "sampling/sampling_logp_difference/mean": 0.018750539049506187, + "step": 512 
+ }, + { + "clip_ratio/high_max": 5.424876235338161e-06, + "clip_ratio/high_mean": 1.3562190588345402e-06, + "clip_ratio/low_mean": 2.538728870149498e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.674350776032952e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15874.0, + "completions/mean_length": 6347.03125, + "completions/mean_terminated_length": 5766.3798828125, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 0.9512053951621056, + "epoch": 0.47194112235510577, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002207641489803791, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 431914122.0, + "reward": 0.4765625, + "reward_std": 0.21648237109184265, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999993085861206, + "sampling/importance_sampling_ratio/min": 0.0011340104974806309, + "sampling/sampling_logp_difference/max": 6.781994819641113, + "sampling/sampling_logp_difference/mean": 0.01931341364979744, + "step": 513 + }, + { + "clip_ratio/high_max": 1.2328315506238141e-05, + "clip_ratio/high_mean": 3.0820788765595353e-06, + "clip_ratio/low_mean": 4.058695458297734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.366903374375397e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14757.0, + "completions/mean_length": 5719.8671875, + "completions/mean_terminated_length": 5635.8974609375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.9754309803247452, + "epoch": 0.47286108555657774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018057655543088913, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 432663249.0, + "reward": 
0.4921875, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999573230743408, + "sampling/importance_sampling_ratio/min": 0.00016155402408912778, + "sampling/sampling_logp_difference/max": 8.730670928955078, + "sampling/sampling_logp_difference/mean": 0.019999589771032333, + "step": 514 + }, + { + "clip_ratio/high_max": 3.34771721099969e-05, + "clip_ratio/high_mean": 8.369293027499225e-06, + "clip_ratio/low_mean": 3.319342158647487e-05, + "clip_ratio/low_min": 3.644846174211125e-06, + "clip_ratio/region_mean": 4.1562714159226744e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 5969.1328125, + "completions/mean_terminated_length": 5803.81787109375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.9498241171240807, + "epoch": 0.47378104875804966, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002828414784744382, + "learning_rate": 1e-05, + "loss": 0.0843, + "num_tokens": 433448874.0, + "reward": 0.4375, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999672174453735, + "sampling/importance_sampling_ratio/min": 0.00043074661516584456, + "sampling/sampling_logp_difference/max": 7.749990463256836, + "sampling/sampling_logp_difference/mean": 0.019238140434026718, + "step": 515 + }, + { + "clip_ratio/high_max": 2.4458067855448462e-05, + "clip_ratio/high_mean": 7.50266553950496e-06, + "clip_ratio/low_mean": 4.7241341690096306e-05, + "clip_ratio/low_min": 4.075511242263019e-06, + "clip_ratio/region_mean": 5.4744006320106564e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14714.0, + "completions/max_terminated_length": 14714.0, + "completions/mean_length": 6808.3671875, + "completions/mean_terminated_length": 6808.3671875, + "completions/min_length": 857.0, + "completions/min_terminated_length": 857.0, + "entropy": 0.9247330650687218, + "epoch": 0.47470101195952163, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019250004552304745, + "learning_rate": 1e-05, + "loss": 0.0535, + "num_tokens": 434338609.0, + "reward": 0.4921875, + "reward_std": 0.36007601022720337, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999670386314392, + "sampling/importance_sampling_ratio/min": 0.00025917106540873647, + "sampling/sampling_logp_difference/max": 8.25802230834961, + "sampling/sampling_logp_difference/mean": 0.01927364431321621, + "step": 516 + }, + { + "clip_ratio/high_max": 2.067027617158601e-05, + "clip_ratio/high_mean": 5.167569042896503e-06, + "clip_ratio/low_mean": 1.523887078747066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0406439944054e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 6119.921875, + "completions/mean_terminated_length": 6039.1025390625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.9210109040141106, + "epoch": 0.47562097516099355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022343189921230078, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 435145247.0, + "reward": 0.5, + "reward_std": 0.2467075139284134, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998999834060669, + 
"sampling/importance_sampling_ratio/min": 0.00011216365965083241, + "sampling/sampling_logp_difference/max": 9.095551490783691, + "sampling/sampling_logp_difference/mean": 0.019618261605501175, + "step": 517 + }, + { + "clip_ratio/high_max": 1.9286600036139134e-05, + "clip_ratio/high_mean": 4.821650009034784e-06, + "clip_ratio/low_mean": 3.679497240227647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1616622866058606e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16282.0, + "completions/mean_length": 6259.0625, + "completions/mean_terminated_length": 6179.33837890625, + "completions/min_length": 1087.0, + "completions/min_terminated_length": 1087.0, + "entropy": 0.9430939853191376, + "epoch": 0.4765409383624655, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00324260420165956, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 435964383.0, + "reward": 0.5, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999647736549377, + "sampling/importance_sampling_ratio/min": 1.5690335203544237e-05, + "sampling/sampling_logp_difference/max": 11.06246566772461, + "sampling/sampling_logp_difference/mean": 0.019678015261888504, + "step": 518 + }, + { + "clip_ratio/high_max": 5.182851054996718e-06, + "clip_ratio/high_mean": 1.2957127637491794e-06, + "clip_ratio/low_mean": 3.5416796038134635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6712508745040395e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14682.0, + "completions/mean_length": 6898.8671875, + "completions/mean_terminated_length": 6748.31005859375, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 
0.9633238166570663, + "epoch": 0.47746090156393745, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017788221593946218, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 436866830.0, + "reward": 0.328125, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014305114746, + "sampling/importance_sampling_ratio/min": 0.007227231748402119, + "sampling/sampling_logp_difference/max": 4.929899215698242, + "sampling/sampling_logp_difference/mean": 0.019975006580352783, + "step": 519 + }, + { + "clip_ratio/high_max": 1.8337552319280803e-05, + "clip_ratio/high_mean": 4.584388079820201e-06, + "clip_ratio/low_mean": 3.3715954828039685e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8300342453112535e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6568.8359375, + "completions/mean_terminated_length": 6333.2724609375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.9648878574371338, + "epoch": 0.47838086476540936, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021614902652800083, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 437728081.0, + "reward": 0.4140625, + "reward_std": 0.24487745761871338, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.001384100178256631, + "sampling/sampling_logp_difference/max": 6.582705020904541, + "sampling/sampling_logp_difference/mean": 0.019699109718203545, + "step": 520 + }, + { + "clip_ratio/high_max": 1.9740967672987608e-05, + "clip_ratio/high_mean": 
4.935241918246902e-06, + "clip_ratio/low_mean": 5.360748559724016e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0295990477970918e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 6709.7265625, + "completions/mean_terminated_length": 6233.9423828125, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.791545994579792, + "epoch": 0.47930082796688134, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002030634554103017, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 438605294.0, + "reward": 0.5, + "reward_std": 0.2435920089483261, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999866485595703, + "sampling/importance_sampling_ratio/min": 0.00981139950454235, + "sampling/sampling_logp_difference/max": 4.624210357666016, + "sampling/sampling_logp_difference/mean": 0.01805954799056053, + "step": 521 + }, + { + "clip_ratio/high_max": 7.663652240808005e-06, + "clip_ratio/high_mean": 1.9159130602020014e-06, + "clip_ratio/low_mean": 2.266609857315416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4582011747042998e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6556.9140625, + "completions/mean_terminated_length": 6400.9287109375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.886083297431469, + "epoch": 0.48022079116835326, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014125843299552798, + "learning_rate": 1e-05, + "loss": 0.0634, + "num_tokens": 439462971.0, + "reward": 0.4921875, + "reward_std": 0.3158818185329437, + "rewards/accuracy_reward/mean": 0.4921875, + 
"rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999947726726532, + "sampling/importance_sampling_ratio/min": 3.454853825246573e-08, + "sampling/sampling_logp_difference/max": 17.18090057373047, + "sampling/sampling_logp_difference/mean": 0.018355879932641983, + "step": 522 + }, + { + "clip_ratio/high_max": 9.186456281895516e-06, + "clip_ratio/high_mean": 2.296614070473879e-06, + "clip_ratio/low_mean": 3.2019113405112876e-05, + "clip_ratio/low_min": 4.055676527059404e-06, + "clip_ratio/region_mean": 3.431572758927359e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6152.4921875, + "completions/mean_terminated_length": 6071.92919921875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9536242336034775, + "epoch": 0.48114075436982523, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00171169254463166, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 440268882.0, + "reward": 0.484375, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99989914894104, + "sampling/importance_sampling_ratio/min": 0.03775034472346306, + "sampling/sampling_logp_difference/max": 3.2767605781555176, + "sampling/sampling_logp_difference/mean": 0.018800247460603714, + "step": 523 + }, + { + "clip_ratio/high_max": 8.734396942600142e-06, + "clip_ratio/high_mean": 2.1835992356500356e-06, + "clip_ratio/low_mean": 4.899439159089525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.117799059917161e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15578.0, + 
"completions/mean_length": 5740.796875, + "completions/mean_terminated_length": 5656.9921875, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9311753436923027, + "epoch": 0.48206071757129715, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002236112719401717, + "learning_rate": 1e-05, + "loss": 0.1033, + "num_tokens": 441020904.0, + "reward": 0.5078125, + "reward_std": 0.34353315830230713, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 0.09267321974039078, + "sampling/sampling_logp_difference/max": 2.378675699234009, + "sampling/sampling_logp_difference/mean": 0.018967337906360626, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9387059296605003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9387059296605003e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15338.0, + "completions/max_terminated_length": 15338.0, + "completions/mean_length": 7279.078125, + "completions/mean_terminated_length": 7279.078125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "entropy": 1.170717716217041, + "epoch": 0.48298068077276907, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0011770959245041013, + "learning_rate": 1e-05, + "loss": 0.0173, + "num_tokens": 441970986.0, + "reward": 0.3515625, + "reward_std": 0.2382800281047821, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999333620071411, + "sampling/importance_sampling_ratio/min": 1.1565300155780278e-05, + "sampling/sampling_logp_difference/max": 11.367501258850098, + 
"sampling/sampling_logp_difference/mean": 0.02134837955236435, + "step": 525 + }, + { + "clip_ratio/high_max": 1.838239040807821e-05, + "clip_ratio/high_mean": 4.595597602019552e-06, + "clip_ratio/low_mean": 3.5013973274544696e-05, + "clip_ratio/low_min": 4.0234326661447994e-06, + "clip_ratio/region_mean": 3.960957087656425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15485.0, + "completions/mean_length": 7376.796875, + "completions/mean_terminated_length": 7233.82568359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 1.0409907028079033, + "epoch": 0.48390064397424104, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002001611515879631, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 442936808.0, + "reward": 0.4453125, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999234676361084, + "sampling/importance_sampling_ratio/min": 0.003353495616465807, + "sampling/sampling_logp_difference/max": 5.697751998901367, + "sampling/sampling_logp_difference/mean": 0.02169732004404068, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.393580459487566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.393580459487566e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 6904.515625, + "completions/mean_terminated_length": 6829.8740234375, + "completions/min_length": 1159.0, + "completions/min_terminated_length": 1159.0, + "entropy": 0.9905650988221169, + "epoch": 0.48482060717571296, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023104713764041662, + "learning_rate": 1e-05, + 
"loss": 0.021, + "num_tokens": 443843010.0, + "reward": 0.3515625, + "reward_std": 0.226732075214386, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 0.0020711510442197323, + "sampling/sampling_logp_difference/max": 6.179650783538818, + "sampling/sampling_logp_difference/mean": 0.020169749855995178, + "step": 527 + }, + { + "clip_ratio/high_max": 3.274137043263181e-06, + "clip_ratio/high_mean": 8.185342608157953e-07, + "clip_ratio/low_mean": 3.806211361734313e-05, + "clip_ratio/low_min": 4.1808816604316235e-06, + "clip_ratio/region_mean": 3.8880647935002344e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15035.0, + "completions/max_terminated_length": 15035.0, + "completions/mean_length": 6611.21875, + "completions/mean_terminated_length": 6611.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.8890361413359642, + "epoch": 0.48574057037718493, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032739758025854826, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 444709854.0, + "reward": 0.4140625, + "reward_std": 0.30327799916267395, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999891996383667, + "sampling/importance_sampling_ratio/min": 0.00029604812152683735, + "sampling/sampling_logp_difference/max": 8.124988555908203, + "sampling/sampling_logp_difference/mean": 0.018246350809931755, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.133989605430543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133989605430543e-05, + "completions/clipped_ratio": 
0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15999.0, + "completions/mean_length": 6928.296875, + "completions/mean_terminated_length": 6853.84228515625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.9614408612251282, + "epoch": 0.48666053357865685, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018802061676979065, + "learning_rate": 1e-05, + "loss": 0.0528, + "num_tokens": 445614284.0, + "reward": 0.4921875, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999129176139832, + "sampling/importance_sampling_ratio/min": 0.02033112570643425, + "sampling/sampling_logp_difference/max": 3.895602226257324, + "sampling/sampling_logp_difference/mean": 0.019618764519691467, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9743174675568298e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9743174675568298e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16194.0, + "completions/mean_length": 7946.8671875, + "completions/mean_terminated_length": 7812.94482421875, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "entropy": 0.9987246319651604, + "epoch": 0.48758049678012877, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002164191100746393, + "learning_rate": 1e-05, + "loss": 0.0192, + "num_tokens": 446649731.0, + "reward": 0.453125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999844431877136, + "sampling/importance_sampling_ratio/min": 
0.0018519347067922354, + "sampling/sampling_logp_difference/max": 6.291524410247803, + "sampling/sampling_logp_difference/mean": 0.020579926669597626, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4596658477094024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4596658477094024e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14446.0, + "completions/mean_length": 6763.53125, + "completions/mean_terminated_length": 6532.64013671875, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "entropy": 0.9593042582273483, + "epoch": 0.48850045998160074, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002090689493343234, + "learning_rate": 1e-05, + "loss": 0.0375, + "num_tokens": 447536311.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999126195907593, + "sampling/importance_sampling_ratio/min": 0.014640630222856998, + "sampling/sampling_logp_difference/max": 4.223954677581787, + "sampling/sampling_logp_difference/mean": 0.019683964550495148, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.527509309402376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.527509309402376e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15805.0, + "completions/mean_length": 7394.40625, + "completions/mean_terminated_length": 7323.6220703125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0184528306126595, + "epoch": 0.48942042318307266, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002562359906733036, + 
"learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 448505707.0, + "reward": 0.2578125, + "reward_std": 0.17123225331306458, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560713768005, + "sampling/importance_sampling_ratio/min": 0.0002687747764866799, + "sampling/sampling_logp_difference/max": 8.221636772155762, + "sampling/sampling_logp_difference/mean": 0.020989736542105675, + "step": 532 + }, + { + "clip_ratio/high_max": 4.772085048898589e-06, + "clip_ratio/high_mean": 1.1930212622246472e-06, + "clip_ratio/low_mean": 2.0207754744205886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.140077623380421e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 7196.328125, + "completions/mean_terminated_length": 6822.84521484375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 1.0106298848986626, + "epoch": 0.49034038638454464, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017445285338908434, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 449443709.0, + "reward": 0.296875, + "reward_std": 0.21436558663845062, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999077320098877, + "sampling/importance_sampling_ratio/min": 0.0012854337692260742, + "sampling/sampling_logp_difference/max": 6.656659126281738, + "sampling/sampling_logp_difference/mean": 0.021059826016426086, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.0835892605173285e-05, + "clip_ratio/low_min": 3.619411700128694e-06, + "clip_ratio/region_mean": 
4.0835892605173285e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 7418.3515625, + "completions/mean_terminated_length": 7203.17626953125, + "completions/min_length": 1445.0, + "completions/min_terminated_length": 1445.0, + "entropy": 1.002836562693119, + "epoch": 0.49126034958601655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015701872762292624, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 450412866.0, + "reward": 0.328125, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527335166931, + "sampling/importance_sampling_ratio/min": 8.191307279048488e-05, + "sampling/sampling_logp_difference/max": 9.409852027893066, + "sampling/sampling_logp_difference/mean": 0.020907817408442497, + "step": 534 + }, + { + "clip_ratio/high_max": 1.0691738907553372e-05, + "clip_ratio/high_mean": 4.761823504395579e-06, + "clip_ratio/low_mean": 9.472978547364619e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.949160914857202e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14089.0, + "completions/mean_length": 7007.109375, + "completions/mean_terminated_length": 6782.064453125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.9748141467571259, + "epoch": 0.4921803127874885, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003912154585123062, + "learning_rate": 1e-05, + "loss": 0.055, + "num_tokens": 451331560.0, + "reward": 0.453125, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9994460344314575, + "sampling/importance_sampling_ratio/min": 1.125945416902141e-07, + "sampling/sampling_logp_difference/max": 15.999472618103027, + "sampling/sampling_logp_difference/mean": 0.026503996923565865, + "step": 535 + }, + { + "clip_ratio/high_max": 1.5173390238487627e-05, + "clip_ratio/high_mean": 3.793347559621907e-06, + "clip_ratio/low_mean": 3.870478303724667e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.249813082424225e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15336.0, + "completions/mean_length": 6605.5, + "completions/mean_terminated_length": 6290.064453125, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "entropy": 0.9742915332317352, + "epoch": 0.49310027598896045, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0029959778767079115, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 452197568.0, + "reward": 0.46875, + "reward_std": 0.3180162310600281, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998653531074524, + "sampling/importance_sampling_ratio/min": 0.0005176665727049112, + "sampling/sampling_logp_difference/max": 7.566179275512695, + "sampling/sampling_logp_difference/mean": 0.019547434523701668, + "step": 536 + }, + { + "clip_ratio/high_max": 4.233987056068145e-06, + "clip_ratio/high_mean": 1.0584967640170362e-06, + "clip_ratio/low_mean": 3.348358245602867e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.454207922004571e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 6091.828125, + "completions/mean_terminated_length": 6010.78759765625, + "completions/min_length": 474.0, + 
"completions/min_terminated_length": 474.0, + "entropy": 0.9893068373203278, + "epoch": 0.49402023919043236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027553467079997063, + "learning_rate": 1e-05, + "loss": 0.064, + "num_tokens": 452995762.0, + "reward": 0.3671875, + "reward_std": 0.22437798976898193, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000625848770142, + "sampling/importance_sampling_ratio/min": 1.8432530168865924e-08, + "sampling/sampling_logp_difference/max": 17.80914878845215, + "sampling/sampling_logp_difference/mean": 0.02093922719359398, + "step": 537 + }, + { + "clip_ratio/high_max": 2.9927550940556102e-05, + "clip_ratio/high_mean": 7.481887735139026e-06, + "clip_ratio/low_mean": 5.346296995867306e-05, + "clip_ratio/low_min": 5.110593065182911e-06, + "clip_ratio/region_mean": 6.094485820540285e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16080.0, + "completions/mean_length": 6864.578125, + "completions/mean_terminated_length": 6789.6220703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 1.005393773317337, + "epoch": 0.49494020239190434, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002985693048685789, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 453896300.0, + "reward": 0.3828125, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999870777130127, + "sampling/importance_sampling_ratio/min": 1.8929262296296656e-05, + "sampling/sampling_logp_difference/max": 10.874801635742188, + "sampling/sampling_logp_difference/mean": 0.019800683483481407, + "step": 538 + }, + { + 
"clip_ratio/high_max": 1.2092638826288749e-05, + "clip_ratio/high_mean": 4.037869075546041e-06, + "clip_ratio/low_mean": 2.9533587621699553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3571456697245594e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14651.0, + "completions/max_terminated_length": 14651.0, + "completions/mean_length": 5828.125, + "completions/mean_terminated_length": 5828.125, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.909324087202549, + "epoch": 0.49586016559337626, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003145795315504074, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 454661564.0, + "reward": 0.359375, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999277591705322, + "sampling/importance_sampling_ratio/min": 5.3384183047455736e-06, + "sampling/sampling_logp_difference/max": 12.140581130981445, + "sampling/sampling_logp_difference/mean": 0.019065624102950096, + "step": 539 + }, + { + "clip_ratio/high_max": 2.344680183341552e-05, + "clip_ratio/high_mean": 5.86170045835388e-06, + "clip_ratio/low_mean": 4.5576647153211525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.143834823684301e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 6213.4140625, + "completions/mean_terminated_length": 6051.9765625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.9570266529917717, + "epoch": 0.49678012879484823, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026711132377386093, + "learning_rate": 1e-05, + "loss": 0.116, + "num_tokens": 455477577.0, + "reward": 0.4296875, + "reward_std": 
0.28930407762527466, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.00041241716826334596, + "sampling/sampling_logp_difference/max": 7.793475151062012, + "sampling/sampling_logp_difference/mean": 0.01995767280459404, + "step": 540 + }, + { + "clip_ratio/high_max": 1.5261470707628177e-05, + "clip_ratio/high_mean": 3.815367676907044e-06, + "clip_ratio/low_mean": 3.6731302770931507e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.054667033415171e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15132.0, + "completions/mean_length": 7125.4140625, + "completions/mean_terminated_length": 7052.51171875, + "completions/min_length": 1374.0, + "completions/min_terminated_length": 1374.0, + "entropy": 0.9259644895792007, + "epoch": 0.49770009199632015, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030442574061453342, + "learning_rate": 1e-05, + "loss": 0.1227, + "num_tokens": 456408966.0, + "reward": 0.484375, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999846816062927, + "sampling/importance_sampling_ratio/min": 0.00023056140344124287, + "sampling/sampling_logp_difference/max": 8.374993324279785, + "sampling/sampling_logp_difference/mean": 0.020200349390506744, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.665321148422663e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.665321148422663e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15909.0, 
+ "completions/mean_length": 6472.1640625, + "completions/mean_terminated_length": 6314.83349609375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.8606229647994041, + "epoch": 0.49862005519779207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002203581389039755, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 457257011.0, + "reward": 0.453125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998785853385925, + "sampling/importance_sampling_ratio/min": 8.579161658417434e-05, + "sampling/sampling_logp_difference/max": 9.3635892868042, + "sampling/sampling_logp_difference/mean": 0.018575064837932587, + "step": 542 + }, + { + "clip_ratio/high_max": 1.1763763723138254e-05, + "clip_ratio/high_mean": 2.9409409307845635e-06, + "clip_ratio/low_mean": 2.8100045369683357e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.104098641415476e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16072.0, + "completions/max_terminated_length": 16072.0, + "completions/mean_length": 7154.0, + "completions/mean_terminated_length": 7154.0, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "entropy": 0.977513425052166, + "epoch": 0.49954001839926404, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001689116470515728, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 458196355.0, + "reward": 0.40625, + "reward_std": 0.18543371558189392, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999965250492096, + "sampling/importance_sampling_ratio/min": 0.00029606535099446774, + "sampling/sampling_logp_difference/max": 8.124930381774902, + 
"sampling/sampling_logp_difference/mean": 0.0198836512863636, + "step": 543 + }, + { + "clip_ratio/high_max": 1.1758888149415725e-05, + "clip_ratio/high_mean": 2.9397220373539312e-06, + "clip_ratio/low_mean": 4.075526112501393e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369498378764547e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16003.0, + "completions/mean_length": 6878.7265625, + "completions/mean_terminated_length": 6727.849609375, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.9291028156876564, + "epoch": 0.500459981600736, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001968112075701356, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 459095320.0, + "reward": 0.4609375, + "reward_std": 0.30274122953414917, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.00014571755309589207, + "sampling/sampling_logp_difference/max": 8.833840370178223, + "sampling/sampling_logp_difference/mean": 0.019927173852920532, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.1461796147123096e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.1461796147123096e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15599.0, + "completions/mean_length": 7187.96875, + "completions/mean_terminated_length": 7042.00048828125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 1.1720879971981049, + "epoch": 0.5013799448022079, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002588641829788685, + "learning_rate": 1e-05, + "loss": 0.0236, + 
"num_tokens": 460042660.0, + "reward": 0.2265625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998769760131836, + "sampling/importance_sampling_ratio/min": 4.738242012081173e-07, + "sampling/sampling_logp_difference/max": 14.562429428100586, + "sampling/sampling_logp_difference/mean": 0.021826796233654022, + "step": 545 + }, + { + "clip_ratio/high_max": 1.55452166836767e-05, + "clip_ratio/high_mean": 3.886304170919175e-06, + "clip_ratio/low_mean": 4.735719005566352e-05, + "clip_ratio/low_min": 4.235134838381782e-06, + "clip_ratio/region_mean": 5.1243494908703724e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16019.0, + "completions/mean_length": 6278.078125, + "completions/mean_terminated_length": 6035.5361328125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.8143310993909836, + "epoch": 0.5022999080036799, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002047745743766427, + "learning_rate": 1e-05, + "loss": 0.064, + "num_tokens": 460864862.0, + "reward": 0.625, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999461770057678, + "sampling/importance_sampling_ratio/min": 0.011549573391675949, + "sampling/sampling_logp_difference/max": 4.461106777191162, + "sampling/sampling_logp_difference/mean": 0.017143042758107185, + "step": 546 + }, + { + "clip_ratio/high_max": 2.9079910746077076e-06, + "clip_ratio/high_mean": 7.269977686519269e-07, + "clip_ratio/low_mean": 6.497366200619581e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.224363969271508e-06, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13575.0, + "completions/mean_length": 5664.8828125, + "completions/mean_terminated_length": 5494.73828125, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "entropy": 0.9489249512553215, + "epoch": 0.5032198712051518, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002950560301542282, + "learning_rate": 1e-05, + "loss": 0.0867, + "num_tokens": 461608471.0, + "reward": 0.625, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999043345451355, + "sampling/importance_sampling_ratio/min": 1.6701715139788575e-05, + "sampling/sampling_logp_difference/max": 10.999999046325684, + "sampling/sampling_logp_difference/mean": 0.019181005656719208, + "step": 547 + }, + { + "clip_ratio/high_max": 1.2411757779773325e-05, + "clip_ratio/high_mean": 3.102939444943331e-06, + "clip_ratio/low_mean": 2.458288531670405e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7685824761647382e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16223.0, + "completions/mean_length": 6914.4375, + "completions/mean_terminated_length": 6839.8740234375, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "entropy": 0.9416745603084564, + "epoch": 0.5041398344066237, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013792186509817839, + "learning_rate": 1e-05, + "loss": 0.0112, + "num_tokens": 462511519.0, + "reward": 0.3671875, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999217391014099, + "sampling/importance_sampling_ratio/min": 4.006533345091157e-05, + "sampling/sampling_logp_difference/max": 10.124999046325684, + "sampling/sampling_logp_difference/mean": 0.01967109739780426, + "step": 548 + }, + { + "clip_ratio/high_max": 7.5066598128614714e-06, + "clip_ratio/high_mean": 1.8766649532153679e-06, + "clip_ratio/low_mean": 3.393825062403266e-05, + "clip_ratio/low_min": 3.3629271456447896e-06, + "clip_ratio/region_mean": 3.581491563409145e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 7343.296875, + "completions/mean_terminated_length": 7051.6611328125, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "entropy": 0.845381110906601, + "epoch": 0.5050597976080957, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028722358401864767, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 463472581.0, + "reward": 0.3984375, + "reward_std": 0.2880156934261322, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880194664001, + "sampling/importance_sampling_ratio/min": 1.5694884496042505e-05, + "sampling/sampling_logp_difference/max": 11.062175750732422, + "sampling/sampling_logp_difference/mean": 0.018903033807873726, + "step": 549 + }, + { + "clip_ratio/high_max": 1.6802483287392533e-05, + "clip_ratio/high_mean": 5.505368051217374e-06, + "clip_ratio/low_mean": 2.8057194754183e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.356256252118328e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13716.0, + "completions/mean_length": 6022.4375, + "completions/mean_terminated_length": 5940.8505859375, + "completions/min_length": 447.0, + 
"completions/min_terminated_length": 447.0, + "entropy": 0.9279188066720963, + "epoch": 0.5059797608095676, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002812078921124339, + "learning_rate": 1e-05, + "loss": 0.0074, + "num_tokens": 464263709.0, + "reward": 0.421875, + "reward_std": 0.26120057702064514, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000264644622803, + "sampling/importance_sampling_ratio/min": 0.0008089813054539263, + "sampling/sampling_logp_difference/max": 7.119734764099121, + "sampling/sampling_logp_difference/mean": 0.01863965392112732, + "step": 550 + }, + { + "clip_ratio/high_max": 1.799457299966889e-05, + "clip_ratio/high_mean": 5.5325897960756265e-06, + "clip_ratio/low_mean": 3.587696073736879e-05, + "clip_ratio/low_min": 2.965106659758021e-06, + "clip_ratio/region_mean": 4.140955002185365e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16333.0, + "completions/mean_length": 6888.6328125, + "completions/mean_terminated_length": 6813.8662109375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 1.0720202773809433, + "epoch": 0.5068997240110396, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001776764984242618, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 465167502.0, + "reward": 0.3203125, + "reward_std": 0.2961437702178955, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945342540741, + "sampling/importance_sampling_ratio/min": 0.0013267829781398177, + "sampling/sampling_logp_difference/max": 6.624998092651367, + "sampling/sampling_logp_difference/mean": 0.02100517973303795, + "step": 551 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.568914848983695e-05, + "clip_ratio/low_min": 3.652834493550472e-06, + "clip_ratio/region_mean": 3.568914848983695e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14283.0, + "completions/mean_length": 6626.7578125, + "completions/mean_terminated_length": 6549.92919921875, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9632527679204941, + "epoch": 0.5078196872125115, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016460138140246272, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 466034535.0, + "reward": 0.5, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000152587890625, + "sampling/importance_sampling_ratio/min": 0.0002774179738480598, + "sampling/sampling_logp_difference/max": 8.189985275268555, + "sampling/sampling_logp_difference/mean": 0.020494937896728516, + "step": 552 + }, + { + "clip_ratio/high_max": 9.810846677282825e-06, + "clip_ratio/high_mean": 2.4527116693207063e-06, + "clip_ratio/low_mean": 2.4154636378170835e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.660734804749154e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16169.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 6685.484375, + "completions/mean_terminated_length": 6685.484375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.9092860966920853, + "epoch": 0.5087396504139834, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019802958704531193, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 466911965.0, + "reward": 0.4609375, + "reward_std": 0.2409384697675705, + 
"rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 0.0020434472244232893, + "sampling/sampling_logp_difference/max": 6.193117141723633, + "sampling/sampling_logp_difference/mean": 0.02000512182712555, + "step": 553 + }, + { + "clip_ratio/high_max": 3.24397274198418e-06, + "clip_ratio/high_mean": 8.10993185496045e-07, + "clip_ratio/low_mean": 2.4120176362885104e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.493116954838115e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7105.1171875, + "completions/mean_terminated_length": 7032.05517578125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 1.046683594584465, + "epoch": 0.5096596136154554, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002490658313035965, + "learning_rate": 1e-05, + "loss": 0.0077, + "num_tokens": 467844820.0, + "reward": 0.2578125, + "reward_std": 0.17123225331306458, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999809265136719, + "sampling/importance_sampling_ratio/min": 7.140394586713228e-07, + "sampling/sampling_logp_difference/max": 14.152327537536621, + "sampling/sampling_logp_difference/mean": 0.020726388320326805, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0303147582344536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0303147582344536e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15969.0, + "completions/max_terminated_length": 15969.0, + "completions/mean_length": 
6806.5546875, + "completions/mean_terminated_length": 6806.5546875, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "entropy": 0.9514358267188072, + "epoch": 0.5105795768169273, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002886313945055008, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 468732451.0, + "reward": 0.3203125, + "reward_std": 0.23250603675842285, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999695420265198, + "sampling/importance_sampling_ratio/min": 3.148883251924417e-06, + "sampling/sampling_logp_difference/max": 12.668462753295898, + "sampling/sampling_logp_difference/mean": 0.019308820366859436, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.485187078742456e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.485187078742456e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16075.0, + "completions/mean_length": 6238.546875, + "completions/mean_terminated_length": 5995.05615234375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 0.9408878460526466, + "epoch": 0.5114995400183993, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002731110667809844, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 469551145.0, + "reward": 0.3671875, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999334812164307, + "sampling/importance_sampling_ratio/min": 0.000488168589072302, + "sampling/sampling_logp_difference/max": 7.624849796295166, + "sampling/sampling_logp_difference/mean": 
0.01883235014975071, + "step": 556 + }, + { + "clip_ratio/high_max": 3.5477096389513463e-06, + "clip_ratio/high_mean": 8.869274097378366e-07, + "clip_ratio/low_mean": 2.5422534008612274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.630946141835011e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16342.0, + "completions/mean_length": 7354.5, + "completions/mean_terminated_length": 7283.4013671875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.9548593312501907, + "epoch": 0.5124195032198712, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022831051610410213, + "learning_rate": 1e-05, + "loss": 0.004, + "num_tokens": 470510305.0, + "reward": 0.4609375, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999933123588562, + "sampling/importance_sampling_ratio/min": 0.00029948100564070046, + "sampling/sampling_logp_difference/max": 8.113459587097168, + "sampling/sampling_logp_difference/mean": 0.020626772195100784, + "step": 557 + }, + { + "clip_ratio/high_max": 1.0478707963557099e-05, + "clip_ratio/high_mean": 2.6196769908892747e-06, + "clip_ratio/low_mean": 4.646405352559668e-05, + "clip_ratio/low_min": 9.308073458669242e-06, + "clip_ratio/region_mean": 4.908373023226886e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16222.0, + "completions/mean_length": 7481.421875, + "completions/mean_terminated_length": 7119.5283203125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.9302244186401367, + "epoch": 0.5133394664213431, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015396618982777, + "learning_rate": 1e-05, + "loss": 0.0944, + 
"num_tokens": 471486799.0, + "reward": 0.34375, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397993087769, + "sampling/importance_sampling_ratio/min": 0.0004175819631200284, + "sampling/sampling_logp_difference/max": 7.78102970123291, + "sampling/sampling_logp_difference/mean": 0.019920824095606804, + "step": 558 + }, + { + "clip_ratio/high_max": 1.2743131946990616e-05, + "clip_ratio/high_mean": 3.185782986747654e-06, + "clip_ratio/low_mean": 3.139938735330361e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.458517039689468e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 7333.9375, + "completions/mean_terminated_length": 7042.0, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0290198475122452, + "epoch": 0.5142594296228151, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002122553065419197, + "learning_rate": 1e-05, + "loss": 0.0653, + "num_tokens": 472443991.0, + "reward": 0.359375, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000252723693848, + "sampling/importance_sampling_ratio/min": 0.00011467799777165055, + "sampling/sampling_logp_difference/max": 9.073382377624512, + "sampling/sampling_logp_difference/mean": 0.020558707416057587, + "step": 559 + }, + { + "clip_ratio/high_max": 2.856805417650321e-05, + "clip_ratio/high_mean": 7.142013544125803e-06, + "clip_ratio/low_mean": 4.716298451512557e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.430499885505924e-05, + "completions/clipped_ratio": 0.0234375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16248.0, + "completions/mean_length": 6908.953125, + "completions/mean_terminated_length": 6681.55224609375, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "entropy": 0.9942271336913109, + "epoch": 0.515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017197602428495884, + "learning_rate": 1e-05, + "loss": 0.1309, + "num_tokens": 473346577.0, + "reward": 0.421875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999131560325623, + "sampling/importance_sampling_ratio/min": 0.00016969948774203658, + "sampling/sampling_logp_difference/max": 8.68148136138916, + "sampling/sampling_logp_difference/mean": 0.019906114786863327, + "step": 560 + }, + { + "clip_ratio/high_max": 2.4387230496358825e-05, + "clip_ratio/high_mean": 7.2725478048596415e-06, + "clip_ratio/low_mean": 3.3024165190909116e-05, + "clip_ratio/low_min": 2.9529187486332376e-06, + "clip_ratio/region_mean": 4.029671254102141e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7543.046875, + "completions/mean_terminated_length": 7183.658203125, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.973315916955471, + "epoch": 0.516099356025759, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001618197187781334, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 474330663.0, + "reward": 0.28125, + "reward_std": 0.28353503346443176, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999313950538635, + 
"sampling/importance_sampling_ratio/min": 2.1410157557966158e-07, + "sampling/sampling_logp_difference/max": 15.356815338134766, + "sampling/sampling_logp_difference/mean": 0.019991599023342133, + "step": 561 + }, + { + "clip_ratio/high_max": 1.8185269482273725e-05, + "clip_ratio/high_mean": 4.546317370568431e-06, + "clip_ratio/low_mean": 5.2758662491214636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7304980941808026e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15773.0, + "completions/mean_length": 7136.375, + "completions/mean_terminated_length": 6838.064453125, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "entropy": 0.8573452606797218, + "epoch": 0.5170193192272309, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025291196070611477, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 475262071.0, + "reward": 0.453125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999455213546753, + "sampling/importance_sampling_ratio/min": 5.8296889619668946e-05, + "sampling/sampling_logp_difference/max": 9.749961853027344, + "sampling/sampling_logp_difference/mean": 0.018726464360952377, + "step": 562 + }, + { + "clip_ratio/high_max": 1.9233400280427304e-05, + "clip_ratio/high_mean": 4.808350070106826e-06, + "clip_ratio/low_mean": 4.3801222432193754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.860957244545716e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 6538.765625, + "completions/mean_terminated_length": 6138.552734375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 
0.8312613591551781, + "epoch": 0.5179392824287029, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018195402808487415, + "learning_rate": 1e-05, + "loss": 0.1266, + "num_tokens": 476119385.0, + "reward": 0.5078125, + "reward_std": 0.3674348294734955, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999343156814575, + "sampling/importance_sampling_ratio/min": 0.005875314120203257, + "sampling/sampling_logp_difference/max": 5.136995792388916, + "sampling/sampling_logp_difference/mean": 0.018957480788230896, + "step": 563 + }, + { + "clip_ratio/high_max": 1.4299099348136224e-05, + "clip_ratio/high_mean": 3.574774837034056e-06, + "clip_ratio/low_mean": 2.9377598366409075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.295237320344313e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6692.078125, + "completions/mean_terminated_length": 5870.72900390625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.943247564136982, + "epoch": 0.5188592456301748, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001623075339011848, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 476995139.0, + "reward": 0.53125, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999014139175415, + "sampling/importance_sampling_ratio/min": 0.0003255821648053825, + "sampling/sampling_logp_difference/max": 8.029895782470703, + "sampling/sampling_logp_difference/mean": 0.019327864050865173, + "step": 564 + }, + { + "clip_ratio/high_max": 2.547848680478637e-06, + "clip_ratio/high_mean": 
6.369621701196593e-07, + "clip_ratio/low_mean": 5.479312403622316e-05, + "clip_ratio/low_min": 8.624037718618638e-06, + "clip_ratio/region_mean": 5.543008592212573e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 7118.40625, + "completions/mean_terminated_length": 6896.0322265625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 1.051003873348236, + "epoch": 0.5197792088316467, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034032040275633335, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 477926583.0, + "reward": 0.359375, + "reward_std": 0.30115145444869995, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.00037551531568169594, + "sampling/sampling_logp_difference/max": 7.887211322784424, + "sampling/sampling_logp_difference/mean": 0.021631836891174316, + "step": 565 + }, + { + "clip_ratio/high_max": 3.823331553576281e-06, + "clip_ratio/high_mean": 9.558328883940703e-07, + "clip_ratio/low_mean": 1.506989860899921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.602573161108012e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 7555.8515625, + "completions/mean_terminated_length": 7415.72265625, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.9771487265825272, + "epoch": 0.5206991720331187, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014035169733688235, + "learning_rate": 1e-05, + "loss": 0.0089, + "num_tokens": 478914724.0, + "reward": 0.1875, + "reward_std": 0.19673939049243927, + 
"rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999145865440369, + "sampling/importance_sampling_ratio/min": 0.0017069041496142745, + "sampling/sampling_logp_difference/max": 6.373074054718018, + "sampling/sampling_logp_difference/mean": 0.020011281594634056, + "step": 566 + }, + { + "clip_ratio/high_max": 4.262138645572122e-06, + "clip_ratio/high_mean": 2.0894199224130716e-06, + "clip_ratio/low_mean": 2.9273888458192232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1363308380605304e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 6505.671875, + "completions/mean_terminated_length": 6019.85205078125, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 0.9913810566067696, + "epoch": 0.5216191352345906, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0012457151897251606, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 479766874.0, + "reward": 0.3984375, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999585151672363, + "sampling/importance_sampling_ratio/min": 5.239284206481898e-08, + "sampling/sampling_logp_difference/max": 16.764495849609375, + "sampling/sampling_logp_difference/mean": 0.01945749670267105, + "step": 567 + }, + { + "clip_ratio/high_max": 4.419772267283406e-06, + "clip_ratio/high_mean": 1.1049430668208515e-06, + "clip_ratio/low_mean": 3.3968740126510966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.507368319333182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15589.0, + "completions/max_terminated_length": 
15589.0, + "completions/mean_length": 6709.96875, + "completions/mean_terminated_length": 6709.96875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 1.053658738732338, + "epoch": 0.5225390984360626, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002912909025326371, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 480644782.0, + "reward": 0.4140625, + "reward_std": 0.2041109800338745, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000216960906982, + "sampling/importance_sampling_ratio/min": 0.00010272916551912203, + "sampling/sampling_logp_difference/max": 9.183414459228516, + "sampling/sampling_logp_difference/mean": 0.020628605037927628, + "step": 568 + }, + { + "clip_ratio/high_max": 1.5635781892342493e-05, + "clip_ratio/high_mean": 5.148336185811786e-06, + "clip_ratio/low_mean": 7.926051148388069e-05, + "clip_ratio/low_min": 9.047379990079207e-06, + "clip_ratio/region_mean": 8.440884812443983e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15677.0, + "completions/max_terminated_length": 15677.0, + "completions/mean_length": 6712.8515625, + "completions/mean_terminated_length": 6712.8515625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "entropy": 0.9288468211889267, + "epoch": 0.5234590616375345, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028935675509274006, + "learning_rate": 1e-05, + "loss": 0.0293, + "num_tokens": 481525875.0, + "reward": 0.328125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999656677246094, + "sampling/importance_sampling_ratio/min": 0.0003157128521706909, + 
"sampling/sampling_logp_difference/max": 8.060677528381348, + "sampling/sampling_logp_difference/mean": 0.0201251357793808, + "step": 569 + }, + { + "clip_ratio/high_max": 1.1007121202055714e-05, + "clip_ratio/high_mean": 2.7517803005139285e-06, + "clip_ratio/low_mean": 4.98413718332813e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2593152645386e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16247.0, + "completions/mean_length": 7452.125, + "completions/mean_terminated_length": 7164.0, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.8201636075973511, + "epoch": 0.5243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0014447550056502223, + "learning_rate": 1e-05, + "loss": 0.1068, + "num_tokens": 482498539.0, + "reward": 0.25, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352097511292, + "sampling/importance_sampling_ratio/min": 0.0008213221444748342, + "sampling/sampling_logp_difference/max": 7.104595184326172, + "sampling/sampling_logp_difference/mean": 0.018142810091376305, + "step": 570 + }, + { + "clip_ratio/high_max": 3.4893782867584378e-06, + "clip_ratio/high_mean": 8.723445716896094e-07, + "clip_ratio/low_mean": 2.5241818775612046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6114163347301655e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16242.0, + "completions/mean_length": 5997.6484375, + "completions/mean_terminated_length": 5915.8662109375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "entropy": 0.9595593363046646, + "epoch": 0.5252989880404784, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0013929647393524647, + "learning_rate": 1e-05, + "loss": -0.0018, + "num_tokens": 483286590.0, + "reward": 0.421875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000566244125366, + "sampling/importance_sampling_ratio/min": 6.860717985546216e-05, + "sampling/sampling_logp_difference/max": 9.587113380432129, + "sampling/sampling_logp_difference/mean": 0.019294174388051033, + "step": 571 + }, + { + "clip_ratio/high_max": 1.2741817272399203e-05, + "clip_ratio/high_mean": 3.1854543180998007e-06, + "clip_ratio/low_mean": 3.2705364901630674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.589081939026073e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15962.0, + "completions/mean_length": 6706.4140625, + "completions/mean_terminated_length": 6474.15234375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9320398196578026, + "epoch": 0.5262189512419503, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020693838596343994, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 484164003.0, + "reward": 0.4296875, + "reward_std": 0.30744946002960205, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999852180480957, + "sampling/importance_sampling_ratio/min": 0.011049352586269379, + "sampling/sampling_logp_difference/max": 4.505383491516113, + "sampling/sampling_logp_difference/mean": 0.01968679018318653, + "step": 572 + }, + { + "clip_ratio/high_max": 1.783004472599714e-05, + "clip_ratio/high_mean": 4.457511181499285e-06, + "clip_ratio/low_mean": 2.067615122314237e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.5133662290954817e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15636.0, + "completions/mean_length": 5317.96875, + "completions/mean_terminated_length": 5230.83447265625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.891069769859314, + "epoch": 0.5271389144434223, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004261080641299486, + "learning_rate": 1e-05, + "loss": 0.0528, + "num_tokens": 484864799.0, + "reward": 0.5234375, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999099969863892, + "sampling/importance_sampling_ratio/min": 0.00014285604993347079, + "sampling/sampling_logp_difference/max": 8.853672981262207, + "sampling/sampling_logp_difference/mean": 0.01876065693795681, + "step": 573 + }, + { + "clip_ratio/high_max": 6.954531272640452e-06, + "clip_ratio/high_mean": 1.738632818160113e-06, + "clip_ratio/low_mean": 4.1548010585756856e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.328664340391697e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6978.7890625, + "completions/mean_terminated_length": 6596.46337890625, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "entropy": 0.9322286397218704, + "epoch": 0.5280588776448942, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013973438180983067, + "learning_rate": 1e-05, + "loss": 0.0396, + "num_tokens": 485779676.0, + "reward": 0.3125, + "reward_std": 0.2675113081932068, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999111890792847, + "sampling/importance_sampling_ratio/min": 0.00024690330610610545, + "sampling/sampling_logp_difference/max": 8.306513786315918, + "sampling/sampling_logp_difference/mean": 0.019345812499523163, + "step": 574 + }, + { + "clip_ratio/high_max": 1.4024310985405464e-05, + "clip_ratio/high_mean": 3.506077746351366e-06, + "clip_ratio/low_mean": 3.8480168882415455e-05, + "clip_ratio/low_min": 8.625057944300352e-06, + "clip_ratio/region_mean": 4.198624606033263e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16178.0, + "completions/mean_length": 6046.4921875, + "completions/mean_terminated_length": 5965.09423828125, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "entropy": 1.0245087146759033, + "epoch": 0.5289788408463661, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015273626195266843, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 486574779.0, + "reward": 0.3046875, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998648166656494, + "sampling/importance_sampling_ratio/min": 0.00043810487841255963, + "sampling/sampling_logp_difference/max": 7.7330522537231445, + "sampling/sampling_logp_difference/mean": 0.01977401226758957, + "step": 575 + }, + { + "clip_ratio/high_max": 1.1012245522579178e-05, + "clip_ratio/high_mean": 2.7530613806447946e-06, + "clip_ratio/low_mean": 2.9637111538249883e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239017382838938e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16086.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 5987.0859375, + "completions/mean_terminated_length": 5987.0859375, + 
"completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.9373713582754135, + "epoch": 0.5298988040478381, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003076995024457574, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 487366590.0, + "reward": 0.4453125, + "reward_std": 0.24830511212348938, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000369548797607, + "sampling/importance_sampling_ratio/min": 0.0004714882234111428, + "sampling/sampling_logp_difference/max": 7.659616470336914, + "sampling/sampling_logp_difference/mean": 0.018766682595014572, + "step": 576 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 487366590, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/zero_to_fp32.py b/dapo_lorafa_20251202_173337/checkpoint-576/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/README.md b/dapo_milora_plus_20251201_131939/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d1f03ef0451784218b16e8ef0ad1a9caf440e512 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/README.md @@ -0,0 +1,68 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: transformers +model_name: dapo_milora_plus_20251201_131939 +tags: +- generated_from_trainer +- trl +- grpo +licence: license +--- + +# Model Card for dapo_milora_plus_20251201_131939 + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/56v55mci) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.25.0 +- Transformers: 4.57.1 +- Pytorch: 2.8.0 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/README.md b/dapo_milora_plus_20251201_131939/checkpoint-128/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-128/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- 
base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/adapter_config.json 
b/dapo_milora_plus_20251201_131939/checkpoint-128/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-128/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "up_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-128/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-128/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if 
message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/latest b/dapo_milora_plus_20251201_131939/checkpoint-128/latest new file mode 100644 index 0000000000000000000000000000000000000000..b4db7fb020d9ef75e52048bf0cde7481e3ef9351 --- /dev/null +++ 
b/dapo_milora_plus_20251201_131939/checkpoint-128/latest @@ -0,0 +1 @@ +global_step128 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-128/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-128/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-128/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-128/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": 
"<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f9752b63fab19d643d532ada018b0f2f19494a35 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-128/trainer_state.json @@ -0,0 +1,4002 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11775528978840846, + "eval_steps": 500, + "global_step": 128, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + 
"completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + "clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + 
"sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + "completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + "sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + 
"completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + 
"sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 
8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + "clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 
0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + 
"grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + 
"completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 
2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + 
"rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, + "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, + "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 
4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + 
"completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 
9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + "sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + 
"rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 
5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + "clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + 
"sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + 
"learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + 
"completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + 
{ + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + 
"reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + 
"sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + 
"epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + "completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + 
"clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, + "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + 
"rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + "learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + 
"completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 
7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + 
"learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + 
"sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 
1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 
5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + "sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + 
"rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + 
"completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + 
"sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 
1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, 
+ "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 
1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 
0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + 
"sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + 
"completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + 
{ + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + "sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 
0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, 
+ "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + "completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + 
"clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + "sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + 
"completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 
4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 
0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + 
"sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + "completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, 
+ "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + "sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + "learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 
855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 
2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + 
"reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + 
"sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 102643751, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-128/zero_to_fp32.py 
b/dapo_milora_plus_20251201_131939/checkpoint-128/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-128/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/README.md b/dapo_milora_plus_20251201_131939/checkpoint-192/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-192/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + 
"k_proj", + "up_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-192/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else 
%}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/latest b/dapo_milora_plus_20251201_131939/checkpoint-192/latest new file mode 100644 index 0000000000000000000000000000000000000000..36721df7ef9c6f050f37be6e76b3d130ed5cbfc7 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/latest @@ -0,0 +1 @@ +global_step192 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-192/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/tokenizer_config.json 
b/dapo_milora_plus_20251201_131939/checkpoint-192/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + 
"tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-192/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b9d1b140006d37df9911f8e79bb9a416d4e546e2 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/trainer_state.json @@ -0,0 +1,5986 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1766329346826127, + "eval_steps": 500, + "global_step": 192, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + 
"clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + 
"completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + 
"sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, 
+ "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + 
"sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 
1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + 
"clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 
0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + 
"completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + 
"rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 
6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, 
+ "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, 
+ "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + 
"completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 
1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + 
"sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + 
"clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, 
+ "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 
8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 
0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 
0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 
3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + 
"completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, 
+ "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + 
"learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + 
"completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 
6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 
5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + 
"sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + 
"loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + 
"completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 
4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 
0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + 
"completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 
0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 
0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 
3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 
6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 
0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 
0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + 
"sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + 
"completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + 
"sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 
84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + 
"sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 
0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 
5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + 
"rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + 
"completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + 
"sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + 
"learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 
525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 
7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + 
"reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + 
"sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + 
"completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + 
"sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 
1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 
169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + 
"clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 
0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + 
"sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 
656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + 
"clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + 
"completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 
18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 
6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, 
+ "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 
140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + 
"completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 
8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + 
"reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + 
"sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + 
"entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 
3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + 
"completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + 
"sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 
0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 161103384, + "num_train_epochs": 1, + "save_steps": 
64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-192/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-192/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-192/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/README.md b/dapo_milora_plus_20251201_131939/checkpoint-256/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-256/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + 
"k_proj", + "up_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-256/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else 
%}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/latest b/dapo_milora_plus_20251201_131939/checkpoint-256/latest new file mode 100644 index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/latest @@ -0,0 +1 @@ +global_step256 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-256/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/tokenizer_config.json 
b/dapo_milora_plus_20251201_131939/checkpoint-256/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + 
"tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-256/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2304c1b8b835a380d86c49270097508c0388c771 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/trainer_state.json @@ -0,0 +1,7970 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.23551057957681693, + "eval_steps": 500, + "global_step": 256, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + 
"clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + 
"completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + 
"sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, 
+ "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + 
"sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 
1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + 
"clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 
0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + 
"completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + 
"rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 
6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, 
+ "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, 
+ "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + 
"completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 
1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + 
"sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + 
"clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, 
+ "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 
8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 
0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 
0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 
3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + 
"completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, 
+ "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + 
"learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + 
"completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 
6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 
5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + 
"sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + 
"loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + 
"completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 
4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 
0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + 
"completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 
0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 
0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 
3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 
6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 
0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 
0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + 
"sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + 
"completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + 
"sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 
84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + 
"sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 
0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 
5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + 
"rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + 
"completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + 
"sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + 
"learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 
525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 
7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + 
"reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + 
"sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + 
"completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + 
"sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 
1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 
169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + 
"clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 
0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + 
"sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 
656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + 
"clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + 
"completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 
18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 
6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, 
+ "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 
140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + 
"completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 
8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + 
"reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + 
"sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + 
"entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 
3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + 
"completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + 
"sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 
0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 
2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + 
"completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + 
"sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + "sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 
819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + "completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + 
"clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + "sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + 
"reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + 
"sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 
3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + 
"completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + 
"clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + 
"reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + 
"sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + 
"entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 
3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + 
"rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + "reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + 
"completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 
10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 
0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 
63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 
4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 
0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + 
"sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 
6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + "completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + 
"step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 
217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + 
"completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 219459140, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-256/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-256/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-256/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/README.md b/dapo_milora_plus_20251201_131939/checkpoint-320/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-320/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + 
"k_proj", + "up_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-320/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else 
%}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/latest b/dapo_milora_plus_20251201_131939/checkpoint-320/latest new file mode 100644 index 0000000000000000000000000000000000000000..9d535587efdab3121736d8095481e4143f000213 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/latest @@ -0,0 +1 @@ +global_step320 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-320/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/tokenizer_config.json 
b/dapo_milora_plus_20251201_131939/checkpoint-320/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + 
"tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ebf99131405e095aadde6f9bf4b506f4e32b67d3 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/trainer_state.json @@ -0,0 +1,9954 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.29438822447102114, + "eval_steps": 500, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + 
"clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + 
"completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + 
"sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, 
+ "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + 
"sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 
1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + 
"clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 
0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + 
"completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + 
"rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 
6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, 
+ "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, 
+ "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + 
"completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 
1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + 
"sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + 
"clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, 
+ "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 
8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 
0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 
0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 
3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + 
"completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, 
+ "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + 
"learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + 
"completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 
6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 
5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + 
"sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + 
"loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + 
"completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 
4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 
0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + 
"completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 
0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 
0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 
3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 
6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 
0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 
0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + 
"sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + 
"completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + 
"sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 
84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + 
"sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 
0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 
5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + 
"rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + 
"completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + 
"sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + 
"learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 
525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 
7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + 
"reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + 
"sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + 
"completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + 
"sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 
1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 
169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + 
"clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 
0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + 
"sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 
656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + 
"clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + 
"completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 
18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 
6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, 
+ "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 
140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + 
"completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 
8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + 
"reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + 
"sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + 
"entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 
3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + 
"completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + 
"sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 
0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 
2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + 
"completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + 
"sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + "sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 
819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + "completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + 
"clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + "sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + 
"reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + 
"sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 
3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + 
"completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + 
"clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + 
"reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + 
"sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + 
"entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 
3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + 
"rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + "reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + 
"completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 
10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 
0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 
63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 
4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 
0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + 
"sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 
6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + "completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + 
"step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 
217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + 
"completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + }, + { + "clip_ratio/high_max": 6.87833608026267e-06, + "clip_ratio/high_mean": 2.9462287329806713e-06, + "clip_ratio/low_mean": 5.435333650893881e-05, + "clip_ratio/low_min": 5.33937054569833e-06, + "clip_ratio/region_mean": 5.729956546929316e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 6448.0078125, + "completions/mean_terminated_length": 6369.771484375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9546648040413857, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004310046322643757, + "learning_rate": 1e-05, + "loss": 0.1082, + "num_tokens": 220304605.0, + "reward": 0.5703125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 0.0001234127557836473, + "sampling/sampling_logp_difference/max": 8.99997615814209, + "sampling/sampling_logp_difference/mean": 0.020253397524356842, + "step": 257 + }, + { + "clip_ratio/high_max": 6.196094091137638e-06, + "clip_ratio/high_mean": 1.5490235227844096e-06, + "clip_ratio/low_mean": 2.5416685957679874e-05, + "clip_ratio/low_min": 5.5736391004757024e-06, + "clip_ratio/region_mean": 2.696570959415112e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 7457.6484375, + "completions/mean_terminated_length": 6941.24755859375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "entropy": 0.8182889074087143, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026646999176591635, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 221281968.0, + "reward": 0.4453125, + "reward_std": 0.2012200653553009, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173283576965, + "sampling/importance_sampling_ratio/min": 2.902353571698768e-06, + "sampling/sampling_logp_difference/max": 12.749988555908203, + "sampling/sampling_logp_difference/mean": 0.019208962097764015, + "step": 258 + }, + { + "clip_ratio/high_max": 1.6189535017474554e-05, + "clip_ratio/high_mean": 4.047383754368639e-06, + "clip_ratio/low_mean": 3.127787306311802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.532525670379982e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 8561.109375, + "completions/mean_terminated_length": 7969.79052734375, + 
"completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.9581378549337387, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016026750672608614, + "learning_rate": 1e-05, + "loss": 0.0131, + "num_tokens": 222399046.0, + "reward": 0.34375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 1.653693971093162e-06, + "sampling/sampling_logp_difference/max": 13.312499046325684, + "sampling/sampling_logp_difference/mean": 0.02173236384987831, + "step": 259 + }, + { + "clip_ratio/high_max": 1.4200771602190798e-05, + "clip_ratio/high_mean": 4.3255887476334465e-06, + "clip_ratio/low_mean": 5.2955770115659107e-05, + "clip_ratio/low_min": 3.402656830076012e-06, + "clip_ratio/region_mean": 5.7281358749605715e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16239.0, + "completions/mean_length": 7152.34375, + "completions/mean_terminated_length": 7079.6533203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9052041247487068, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005460259038954973, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 223335010.0, + "reward": 0.4296875, + "reward_std": 0.3356297016143799, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966621398926, + "sampling/importance_sampling_ratio/min": 0.010161337442696095, + "sampling/sampling_logp_difference/max": 4.589165210723877, + "sampling/sampling_logp_difference/mean": 0.01986619457602501, 
+ "step": 260 + }, + { + "clip_ratio/high_max": 1.4350314813782461e-05, + "clip_ratio/high_mean": 3.5875787034456152e-06, + "clip_ratio/low_mean": 3.81288905373367e-05, + "clip_ratio/low_min": 8.099272235995159e-06, + "clip_ratio/region_mean": 4.1716469809216505e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 6678.65625, + "completions/mean_terminated_length": 6524.603515625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.9043187350034714, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005933742038905621, + "learning_rate": 1e-05, + "loss": 0.0966, + "num_tokens": 224207006.0, + "reward": 0.484375, + "reward_std": 0.3316681981086731, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000031590461731, + "sampling/importance_sampling_ratio/min": 0.0011734943836927414, + "sampling/sampling_logp_difference/max": 6.747769355773926, + "sampling/sampling_logp_difference/mean": 0.019827336072921753, + "step": 261 + }, + { + "clip_ratio/high_max": 1.6498819377375185e-05, + "clip_ratio/high_mean": 4.124704844343796e-06, + "clip_ratio/low_mean": 3.601791678420341e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014262168539062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6999.0390625, + "completions/mean_terminated_length": 6850.07177734375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8109970837831497, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003635740838944912, + "learning_rate": 1e-05, + "loss": 0.104, + "num_tokens": 
225122891.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303817749023, + "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05, + "sampling/sampling_logp_difference/max": 10.987512588500977, + "sampling/sampling_logp_difference/mean": 0.018912551924586296, + "step": 262 + }, + { + "clip_ratio/high_max": 9.527577958579059e-06, + "clip_ratio/high_mean": 2.3818944896447647e-06, + "clip_ratio/low_mean": 3.766565987461945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004755419373396e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7483.7109375, + "completions/mean_terminated_length": 7045.9912109375, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "entropy": 0.9473970532417297, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003405241761356592, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 226102462.0, + "reward": 0.4453125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002920627594, + "sampling/importance_sampling_ratio/min": 0.00525119062513113, + "sampling/sampling_logp_difference/max": 5.249300479888916, + "sampling/sampling_logp_difference/mean": 0.021076779812574387, + "step": 263 + }, + { + "clip_ratio/high_max": 1.5867321963014547e-05, + "clip_ratio/high_mean": 3.966830490753637e-06, + "clip_ratio/low_mean": 3.8259706570897833e-05, + "clip_ratio/low_min": 3.549019083948224e-06, + "clip_ratio/region_mean": 4.2226537743772496e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 7569.03125, + "completions/mean_terminated_length": 7357.47216796875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9231455475091934, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025927501264959574, + "learning_rate": 1e-05, + "loss": 0.0801, + "num_tokens": 227093562.0, + "reward": 0.3984375, + "reward_std": 0.19097033143043518, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0052477638237178326, + "sampling/sampling_logp_difference/max": 5.249953269958496, + "sampling/sampling_logp_difference/mean": 0.020578444004058838, + "step": 264 + }, + { + "clip_ratio/high_max": 1.344091060673236e-05, + "clip_ratio/high_mean": 3.36022765168309e-06, + "clip_ratio/low_mean": 4.253613235505327e-05, + "clip_ratio/low_min": 3.5579084851633525e-06, + "clip_ratio/region_mean": 4.5896360120423196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 7589.2734375, + "completions/mean_terminated_length": 7378.2001953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9265239909291267, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030512227676808834, + "learning_rate": 1e-05, + "loss": 0.04, + "num_tokens": 228086405.0, + "reward": 0.4296875, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0002165911573683843, + "sampling/sampling_logp_difference/max": 8.437499046325684, + "sampling/sampling_logp_difference/mean": 0.020208362489938736, + "step": 265 + }, + { + "clip_ratio/high_max": 1.9613525410022703e-05, + "clip_ratio/high_mean": 4.903381352505676e-06, + "clip_ratio/low_mean": 3.184792547017423e-05, + "clip_ratio/low_min": 7.29296516510658e-06, + "clip_ratio/region_mean": 3.675130722058384e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 8420.6875, + "completions/mean_terminated_length": 8096.97509765625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "entropy": 0.9572964608669281, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022430522367358208, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 229183765.0, + "reward": 0.34375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 0.00029693738906644285, + "sampling/sampling_logp_difference/max": 8.121989250183105, + "sampling/sampling_logp_difference/mean": 0.021570362150669098, + "step": 266 + }, + { + "clip_ratio/high_max": 6.728750577167375e-06, + "clip_ratio/high_mean": 1.6821876442918438e-06, + "clip_ratio/low_mean": 2.1682553096979973e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.336474062758498e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15736.0, + "completions/mean_length": 6809.765625, + "completions/mean_terminated_length": 6579.984375, + 
"completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.884086549282074, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004295065999031067, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 230077607.0, + "reward": 0.484375, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00754612497985363, + "sampling/sampling_logp_difference/max": 4.886721134185791, + "sampling/sampling_logp_difference/mean": 0.019895706325769424, + "step": 267 + }, + { + "clip_ratio/high_max": 2.8609347509700456e-05, + "clip_ratio/high_mean": 7.152336877425114e-06, + "clip_ratio/low_mean": 5.158006410965754e-05, + "clip_ratio/low_min": 5.210069957684027e-06, + "clip_ratio/region_mean": 5.873240070286556e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15080.0, + "completions/mean_length": 7340.6953125, + "completions/mean_terminated_length": 6973.0810546875, + "completions/min_length": 1616.0, + "completions/min_terminated_length": 1616.0, + "entropy": 0.9920620769262314, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004631794057786465, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 231035616.0, + "reward": 0.4375, + "reward_std": 0.3235401213169098, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.0002508950710762292, + "sampling/sampling_logp_difference/max": 8.290475845336914, + "sampling/sampling_logp_difference/mean": 0.020591016858816147, + 
"step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.3085940774290066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3085940774290066e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14120.0, + "completions/mean_length": 6748.875, + "completions/mean_terminated_length": 6595.93701171875, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.9867061004042625, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035752104595303535, + "learning_rate": 1e-05, + "loss": 0.0455, + "num_tokens": 231920056.0, + "reward": 0.40625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.0003869794018100947, + "sampling/sampling_logp_difference/max": 7.8571391105651855, + "sampling/sampling_logp_difference/mean": 0.02061416581273079, + "step": 269 + }, + { + "clip_ratio/high_max": 1.2506750408647349e-05, + "clip_ratio/high_mean": 3.1266876021618373e-06, + "clip_ratio/low_mean": 3.10397430212106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.416643085074611e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 7260.3046875, + "completions/mean_terminated_length": 7188.46435546875, + "completions/min_length": 1384.0, + "completions/min_terminated_length": 1384.0, + "entropy": 1.0388494208455086, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036644963547587395, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 232869159.0, + "reward": 0.390625, + "reward_std": 
0.2359209954738617, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999546408653259, + "sampling/importance_sampling_ratio/min": 0.0008660226594656706, + "sampling/sampling_logp_difference/max": 7.051599502563477, + "sampling/sampling_logp_difference/mean": 0.02120530977845192, + "step": 270 + }, + { + "clip_ratio/high_max": 2.704355301830219e-05, + "clip_ratio/high_mean": 6.760888254575548e-06, + "clip_ratio/low_mean": 3.1861192269388994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862208097871189e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16073.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 6354.4609375, + "completions/mean_terminated_length": 6354.4609375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "entropy": 0.8405331820249557, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004709267523139715, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 233702842.0, + "reward": 0.546875, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 0.0046309432946145535, + "sampling/sampling_logp_difference/max": 5.37499475479126, + "sampling/sampling_logp_difference/mean": 0.019126038998365402, + "step": 271 + }, + { + "clip_ratio/high_max": 9.749228638611385e-06, + "clip_ratio/high_mean": 2.437307159652846e-06, + "clip_ratio/low_mean": 3.855073941849696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.098804652130639e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16026.0, + "completions/mean_length": 6514.578125, + "completions/mean_terminated_length": 6357.9208984375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 1.0254098922014236, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003066045930609107, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 234556348.0, + "reward": 0.4375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 0.005210204049944878, + "sampling/sampling_logp_difference/max": 5.257136344909668, + "sampling/sampling_logp_difference/mean": 0.019960148259997368, + "step": 272 + }, + { + "clip_ratio/high_max": 1.0475813724042382e-05, + "clip_ratio/high_mean": 2.6189534310105955e-06, + "clip_ratio/low_mean": 3.487835761006863e-05, + "clip_ratio/low_min": 2.9392399483185727e-06, + "clip_ratio/region_mean": 3.749731081370555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 7379.5546875, + "completions/mean_terminated_length": 7236.62744140625, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 1.0397320613265038, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005132520105689764, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 235521091.0, + "reward": 0.2890625, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999256134033203, + "sampling/importance_sampling_ratio/min": 
0.00016659013635944575, + "sampling/sampling_logp_difference/max": 8.699974060058594, + "sampling/sampling_logp_difference/mean": 0.021417103707790375, + "step": 273 + }, + { + "clip_ratio/high_max": 1.9904123973901733e-05, + "clip_ratio/high_mean": 5.776861314643611e-06, + "clip_ratio/low_mean": 2.6659268655748747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2436129686175263e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 7837.1640625, + "completions/mean_terminated_length": 7632.04052734375, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "entropy": 0.8400963917374611, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028969801496714354, + "learning_rate": 1e-05, + "loss": 0.0143, + "num_tokens": 236544160.0, + "reward": 0.3828125, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887943267822, + "sampling/importance_sampling_ratio/min": 2.883308241052873e-07, + "sampling/sampling_logp_difference/max": 15.059157371520996, + "sampling/sampling_logp_difference/mean": 0.019267702475190163, + "step": 274 + }, + { + "clip_ratio/high_max": 8.562770290154731e-06, + "clip_ratio/high_mean": 2.1406925725386827e-06, + "clip_ratio/low_mean": 4.060094340729847e-05, + "clip_ratio/low_min": 3.8700886761944275e-06, + "clip_ratio/region_mean": 4.2741635979837156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15350.0, + "completions/mean_length": 6696.3515625, + "completions/mean_terminated_length": 6542.57958984375, + "completions/min_length": 1239.0, + "completions/min_terminated_length": 1239.0, + "entropy": 
0.8495818004012108, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003412836929783225, + "learning_rate": 1e-05, + "loss": 0.0803, + "num_tokens": 237423101.0, + "reward": 0.515625, + "reward_std": 0.37981897592544556, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.012152798473834991, + "sampling/sampling_logp_difference/max": 4.410195827484131, + "sampling/sampling_logp_difference/mean": 0.018458625301718712, + "step": 275 + }, + { + "clip_ratio/high_max": 1.1463653436294408e-05, + "clip_ratio/high_mean": 3.646129641765583e-06, + "clip_ratio/low_mean": 6.144847083078275e-05, + "clip_ratio/low_min": 1.110105540647055e-05, + "clip_ratio/region_mean": 6.509460160941671e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15666.0, + "completions/mean_length": 7700.3671875, + "completions/mean_terminated_length": 7121.45849609375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.8258870914578438, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024443145375698805, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 238429956.0, + "reward": 0.375, + "reward_std": 0.2872493863105774, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999113082885742, + "sampling/importance_sampling_ratio/min": 0.00026112530031241477, + "sampling/sampling_logp_difference/max": 8.250510215759277, + "sampling/sampling_logp_difference/mean": 0.019427984952926636, + "step": 276 + }, + { + "clip_ratio/high_max": 4.218127742205979e-06, + "clip_ratio/high_mean": 
1.0545319355514948e-06, + "clip_ratio/low_mean": 1.7289162997258245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.834369493280974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16112.0, + "completions/mean_length": 6255.21875, + "completions/mean_terminated_length": 6094.44482421875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.8179014846682549, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022747826296836138, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 239250160.0, + "reward": 0.5234375, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.0002633975527714938, + "sampling/sampling_logp_difference/max": 8.241846084594727, + "sampling/sampling_logp_difference/mean": 0.018723051995038986, + "step": 277 + }, + { + "clip_ratio/high_max": 1.698448841125355e-05, + "clip_ratio/high_mean": 5.369374321162468e-06, + "clip_ratio/low_mean": 6.14647315160255e-05, + "clip_ratio/low_min": 5.043576493335422e-06, + "clip_ratio/region_mean": 6.683410583718796e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15321.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6914.9609375, + "completions/mean_terminated_length": 6914.9609375, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9700981751084328, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005685295443981886, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 240156211.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 
0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05, + "sampling/sampling_logp_difference/max": 9.997581481933594, + "sampling/sampling_logp_difference/mean": 0.021195171400904655, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9186837764427764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9186837764427764e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15469.0, + "completions/mean_length": 5227.53125, + "completions/mean_terminated_length": 5139.68505859375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9116031974554062, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003880272386595607, + "learning_rate": 1e-05, + "loss": 0.1246, + "num_tokens": 240845295.0, + "reward": 0.6328125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000362396240234, + "sampling/importance_sampling_ratio/min": 0.00012422871077433228, + "sampling/sampling_logp_difference/max": 8.993386268615723, + "sampling/sampling_logp_difference/mean": 0.018801718950271606, + "step": 279 + }, + { + "clip_ratio/high_max": 2.5015486926349695e-05, + "clip_ratio/high_mean": 8.084949570275057e-06, + "clip_ratio/low_mean": 5.524710468307603e-05, + "clip_ratio/low_min": 3.776891389861703e-06, + "clip_ratio/region_mean": 6.333205465125502e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 
8065.4765625, + "completions/mean_terminated_length": 7510.90869140625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.7446574792265892, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028986844699829817, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 241895676.0, + "reward": 0.4921875, + "reward_std": 0.3474721610546112, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.0017039099475368857, + "sampling/sampling_logp_difference/max": 6.3748297691345215, + "sampling/sampling_logp_difference/mean": 0.01853121444582939, + "step": 280 + }, + { + "clip_ratio/high_max": 9.486341014053323e-06, + "clip_ratio/high_mean": 2.371585253513331e-06, + "clip_ratio/low_mean": 2.896106741445692e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133265261112683e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15534.0, + "completions/max_terminated_length": 15534.0, + "completions/mean_length": 6127.359375, + "completions/mean_terminated_length": 6127.359375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.8569132760167122, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003845847910270095, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 242698258.0, + "reward": 0.53125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000942945480347, + "sampling/importance_sampling_ratio/min": 0.00043231461313553154, + "sampling/sampling_logp_difference/max": 7.746356964111328, + 
"sampling/sampling_logp_difference/mean": 0.01856958493590355, + "step": 281 + }, + { + "clip_ratio/high_max": 2.9848330086679198e-05, + "clip_ratio/high_mean": 7.4620825216697995e-06, + "clip_ratio/low_mean": 4.3558867673709756e-05, + "clip_ratio/low_min": 4.417741820361698e-06, + "clip_ratio/region_mean": 5.1020949285884853e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15192.0, + "completions/mean_length": 6600.1484375, + "completions/mean_terminated_length": 6365.33642578125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.78924310952425, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003953634761273861, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 243560957.0, + "reward": 0.5546875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.0006525487406179309, + "sampling/sampling_logp_difference/max": 7.334624767303467, + "sampling/sampling_logp_difference/mean": 0.018097909167408943, + "step": 282 + }, + { + "clip_ratio/high_max": 6.635561703660642e-06, + "clip_ratio/high_mean": 1.6588904259151604e-06, + "clip_ratio/low_mean": 2.737523408313791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9034124281679397e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7852.171875, + "completions/mean_terminated_length": 7852.171875, + "completions/min_length": 1276.0, + "completions/min_terminated_length": 1276.0, + "entropy": 1.0598893761634827, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00360781978815794, 
+ "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 244585923.0, + "reward": 0.3125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05, + "sampling/sampling_logp_difference/max": 10.076086044311523, + "sampling/sampling_logp_difference/mean": 0.022330068051815033, + "step": 283 + }, + { + "clip_ratio/high_max": 3.1540168947685743e-06, + "clip_ratio/high_mean": 7.885042236921436e-07, + "clip_ratio/low_mean": 4.7973388973332476e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.876189268543385e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7972.2265625, + "completions/mean_terminated_length": 7700.87890625, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.933217465877533, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0027661293279379606, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 245628064.0, + "reward": 0.28125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05, + "sampling/sampling_logp_difference/max": 10.366576194763184, + "sampling/sampling_logp_difference/mean": 0.021125148981809616, + "step": 284 + }, + { + "clip_ratio/high_max": 1.2965969062861404e-05, + "clip_ratio/high_mean": 3.241492265715351e-06, + "clip_ratio/low_mean": 4.6317693090713874e-05, + "clip_ratio/low_min": 3.820877282123547e-06, + 
"clip_ratio/region_mean": 4.955918507221213e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7135.6953125, + "completions/mean_terminated_length": 6913.736328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.7786942347884178, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005680318456143141, + "learning_rate": 1e-05, + "loss": 0.0786, + "num_tokens": 246561329.0, + "reward": 0.4296875, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462366104126, + "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05, + "sampling/sampling_logp_difference/max": 9.737424850463867, + "sampling/sampling_logp_difference/mean": 0.018504241481423378, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.22437145175536e-05, + "clip_ratio/low_min": 1.4025082009538892e-05, + "clip_ratio/region_mean": 4.22437145175536e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6704.046875, + "completions/mean_terminated_length": 6627.82666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 1.0435140281915665, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026402862276881933, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 247437415.0, + "reward": 0.3828125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.0007800163584761322, + "sampling/sampling_logp_difference/max": 7.156195640563965, + "sampling/sampling_logp_difference/mean": 0.02134273201227188, + "step": 286 + }, + { + "clip_ratio/high_max": 2.223430897174694e-05, + "clip_ratio/high_mean": 6.8746438159905665e-06, + "clip_ratio/low_mean": 4.7084630978133646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3959275192028144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 5892.5078125, + "completions/mean_terminated_length": 5725.9765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.8004944771528244, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003993614576756954, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 248211112.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0024652592837810516, + "sampling/sampling_logp_difference/max": 6.005458354949951, + "sampling/sampling_logp_difference/mean": 0.01924925297498703, + "step": 287 + }, + { + "clip_ratio/high_max": 2.1833082200828358e-05, + "clip_ratio/high_mean": 5.458270550207089e-06, + "clip_ratio/low_mean": 3.415995615796419e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961822596920683e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 7812.140625, + "completions/mean_terminated_length": 7316.24755859375, + "completions/min_length": 1515.0, + 
"completions/min_terminated_length": 1515.0, + "entropy": 0.8841542899608612, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001573400106281042, + "learning_rate": 1e-05, + "loss": 0.0823, + "num_tokens": 249228106.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 0.001001527882181108, + "sampling/sampling_logp_difference/max": 6.906228542327881, + "sampling/sampling_logp_difference/mean": 0.01956877112388611, + "step": 288 + }, + { + "clip_ratio/high_max": 1.014439021673752e-05, + "clip_ratio/high_mean": 2.53609755418438e-06, + "clip_ratio/low_mean": 3.068193461785995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.321803217204433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 6372.953125, + "completions/mean_terminated_length": 6132.6884765625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.8228401988744736, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021125099156051874, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 250063284.0, + "reward": 0.5, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05, + "sampling/sampling_logp_difference/max": 9.937475204467773, + "sampling/sampling_logp_difference/mean": 0.01943521574139595, + "step": 289 + }, + { + "clip_ratio/high_max": 
7.023906164249638e-06, + "clip_ratio/high_mean": 1.7559765410624095e-06, + "clip_ratio/low_mean": 2.526416994896863e-05, + "clip_ratio/low_min": 6.7760895490209805e-06, + "clip_ratio/region_mean": 2.7020146660561295e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16270.0, + "completions/mean_length": 7817.8671875, + "completions/mean_terminated_length": 7396.58154296875, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.9454319775104523, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022315154783427715, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 251085123.0, + "reward": 0.40625, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06, + "sampling/sampling_logp_difference/max": 12.760490417480469, + "sampling/sampling_logp_difference/mean": 0.021764669567346573, + "step": 290 + }, + { + "clip_ratio/high_max": 1.4797966287005693e-05, + "clip_ratio/high_mean": 3.699491571751423e-06, + "clip_ratio/low_mean": 4.36271948274225e-05, + "clip_ratio/low_min": 3.6957101201551268e-06, + "clip_ratio/region_mean": 4.732668639917392e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 7168.4921875, + "completions/mean_terminated_length": 6635.36328125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8433891162276268, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 252020906.0, + "reward": 
0.5546875, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.0003851866349577904, + "sampling/sampling_logp_difference/max": 7.861782550811768, + "sampling/sampling_logp_difference/mean": 0.01929781585931778, + "step": 291 + }, + { + "clip_ratio/high_max": 1.996871560550062e-05, + "clip_ratio/high_mean": 6.089093403716106e-06, + "clip_ratio/low_mean": 4.2792244585143635e-05, + "clip_ratio/low_min": 1.0337215371691855e-05, + "clip_ratio/region_mean": 4.8881338216233416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7322.5078125, + "completions/mean_terminated_length": 6876.8603515625, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 0.9157031401991844, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036942458245903254, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 252977435.0, + "reward": 0.3359375, + "reward_std": 0.24275577068328857, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.00029605376766994596, + "sampling/sampling_logp_difference/max": 8.124969482421875, + "sampling/sampling_logp_difference/mean": 0.0205365102738142, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.631919460327481e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.631919460327481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16078.0, + "completions/mean_length": 7025.484375, + "completions/mean_terminated_length": 6723.5966796875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 1.1329731941223145, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034127074759453535, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 253896161.0, + "reward": 0.25, + "reward_std": 0.27722424268722534, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0005197672289796174, + "sampling/sampling_logp_difference/max": 7.562129497528076, + "sampling/sampling_logp_difference/mean": 0.023741140961647034, + "step": 293 + }, + { + "clip_ratio/high_max": 4.368643658381188e-06, + "clip_ratio/high_mean": 1.092160914595297e-06, + "clip_ratio/low_mean": 2.4661783299961826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5753944555617636e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13776.0, + "completions/mean_length": 5996.1796875, + "completions/mean_terminated_length": 5661.08837890625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8773328885436058, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003959407564252615, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 254690264.0, + "reward": 0.53125, + "reward_std": 0.26645541191101074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07, + 
"sampling/sampling_logp_difference/max": 15.73043155670166, + "sampling/sampling_logp_difference/mean": 0.018407585099339485, + "step": 294 + }, + { + "clip_ratio/high_max": 1.616483677935321e-05, + "clip_ratio/high_mean": 4.041209194838302e-06, + "clip_ratio/low_mean": 3.736187466074625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140308453770558e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7165.328125, + "completions/mean_terminated_length": 6867.951171875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.9502597972750664, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030910037457942963, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 255626394.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000731945037842, + "sampling/importance_sampling_ratio/min": 0.00022311302018351853, + "sampling/sampling_logp_difference/max": 8.407832145690918, + "sampling/sampling_logp_difference/mean": 0.020668907091021538, + "step": 295 + }, + { + "clip_ratio/high_max": 1.1702686606440693e-05, + "clip_ratio/high_mean": 2.9256716516101733e-06, + "clip_ratio/low_mean": 5.5247357522603124e-05, + "clip_ratio/low_min": 3.6811261452385224e-06, + "clip_ratio/region_mean": 5.8173028264718596e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15375.0, + "completions/mean_length": 8001.9296875, + "completions/mean_terminated_length": 7661.34912109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8591345250606537, + "epoch": 
0.27230910763569455, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037233952898532152, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 256673457.0, + "reward": 0.421875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.0021876997780054808, + "sampling/sampling_logp_difference/max": 6.124904632568359, + "sampling/sampling_logp_difference/mean": 0.020540472120046616, + "step": 296 + }, + { + "clip_ratio/high_max": 3.721341136042611e-05, + "clip_ratio/high_mean": 1.2759249216287571e-05, + "clip_ratio/low_mean": 3.570647322703735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.846572301175911e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 6924.84375, + "completions/mean_terminated_length": 6697.82421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.7969356626272202, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006054217461496592, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 257578501.0, + "reward": 0.5078125, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.007889713160693645, + "sampling/sampling_logp_difference/max": 4.842195510864258, + "sampling/sampling_logp_difference/mean": 0.019306108355522156, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0211543894911301e-05, + "clip_ratio/high_mean": 2.5528859737278253e-06, + 
"clip_ratio/low_mean": 5.2388056587915344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4940942732173426e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14439.0, + "completions/mean_length": 6203.03125, + "completions/mean_terminated_length": 5958.6884765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.8734413683414459, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004903806839138269, + "learning_rate": 1e-05, + "loss": 0.0689, + "num_tokens": 258392625.0, + "reward": 0.4453125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 0.00020370795391499996, + "sampling/sampling_logp_difference/max": 8.498823165893555, + "sampling/sampling_logp_difference/mean": 0.01909301057457924, + "step": 298 + }, + { + "clip_ratio/high_max": 1.5135058674786706e-05, + "clip_ratio/high_mean": 4.64845766146027e-06, + "clip_ratio/low_mean": 4.373456977191381e-05, + "clip_ratio/low_min": 3.670856358439778e-06, + "clip_ratio/region_mean": 4.8383026296505705e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 7982.5390625, + "completions/mean_terminated_length": 7641.01611328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0091779381036758, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033637424930930138, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 259435270.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999765753746033, + "sampling/importance_sampling_ratio/min": 0.0016514655435457826, + "sampling/sampling_logp_difference/max": 6.406092166900635, + "sampling/sampling_logp_difference/mean": 0.02182736061513424, + "step": 299 + }, + { + "clip_ratio/high_max": 2.3964702677403693e-05, + "clip_ratio/high_mean": 5.991175669350923e-06, + "clip_ratio/low_mean": 5.2442986770984135e-05, + "clip_ratio/low_min": 8.75736759553547e-06, + "clip_ratio/region_mean": 5.843416238349164e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6915.3125, + "completions/mean_terminated_length": 6688.064453125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.7964543774724007, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052203768864274025, + "learning_rate": 1e-05, + "loss": 0.144, + "num_tokens": 260337614.0, + "reward": 0.46875, + "reward_std": 0.37928223609924316, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 7.032832218101248e-05, + "sampling/sampling_logp_difference/max": 9.562335968017578, + "sampling/sampling_logp_difference/mean": 0.017896221950650215, + "step": 300 + }, + { + "clip_ratio/high_max": 4.458271632756805e-05, + "clip_ratio/high_mean": 1.1145679081892013e-05, + "clip_ratio/low_mean": 6.243192206056847e-05, + "clip_ratio/low_min": 1.2397775662975619e-05, + "clip_ratio/region_mean": 7.357759886872373e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + 
"completions/mean_length": 7029.4375, + "completions/mean_terminated_length": 6880.95263671875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.8605096861720085, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005570738110691309, + "learning_rate": 1e-05, + "loss": 0.0984, + "num_tokens": 261254070.0, + "reward": 0.4765625, + "reward_std": 0.3327290117740631, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999494552612305, + "sampling/importance_sampling_ratio/min": 0.0009070249507203698, + "sampling/sampling_logp_difference/max": 7.005340576171875, + "sampling/sampling_logp_difference/mean": 0.01905740052461624, + "step": 301 + }, + { + "clip_ratio/high_max": 3.390461233720998e-05, + "clip_ratio/high_mean": 1.1191766247975465e-05, + "clip_ratio/low_mean": 7.46641262594494e-05, + "clip_ratio/low_min": 5.041745680500753e-06, + "clip_ratio/region_mean": 8.585589102949598e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5858.84375, + "completions/mean_terminated_length": 5606.240234375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8430554121732712, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004496110137552023, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 262024906.0, + "reward": 0.4453125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294877052307, + "sampling/importance_sampling_ratio/min": 0.00040469475788995624, + 
"sampling/sampling_logp_difference/max": 7.812377452850342, + "sampling/sampling_logp_difference/mean": 0.019225869327783585, + "step": 302 + }, + { + "clip_ratio/high_max": 3.2563955301156966e-06, + "clip_ratio/high_mean": 8.140988825289242e-07, + "clip_ratio/low_mean": 3.7080020149460324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.789411886145899e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15976.0, + "completions/mean_length": 8337.328125, + "completions/mean_terminated_length": 7728.7568359375, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.901745393872261, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00348713924176991, + "learning_rate": 1e-05, + "loss": -0.0002, + "num_tokens": 263110844.0, + "reward": 0.296875, + "reward_std": 0.20805485546588898, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.0022652465850114822, + "sampling/sampling_logp_difference/max": 6.090071678161621, + "sampling/sampling_logp_difference/mean": 0.02157524600625038, + "step": 303 + }, + { + "clip_ratio/high_max": 2.3739744847262045e-05, + "clip_ratio/high_mean": 5.934936211815511e-06, + "clip_ratio/low_mean": 2.823553325015382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.417046866616147e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7084.7265625, + "completions/mean_terminated_length": 6381.42041015625, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8265534415841103, + "epoch": 0.2796688132474701, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003980033565312624, + "learning_rate": 1e-05, + "loss": 0.0551, + "num_tokens": 264036169.0, + "reward": 0.3984375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673366546631, + "sampling/importance_sampling_ratio/min": 0.00012345099821686745, + "sampling/sampling_logp_difference/max": 8.999666213989258, + "sampling/sampling_logp_difference/mean": 0.018782664090394974, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1745505617000163e-05, + "clip_ratio/high_mean": 3.771558226617344e-06, + "clip_ratio/low_mean": 6.913120819262986e-05, + "clip_ratio/low_min": 2.494283216947224e-05, + "clip_ratio/region_mean": 7.290276607818669e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6543.796875, + "completions/mean_terminated_length": 6543.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8899869695305824, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.006467343773692846, + "learning_rate": 1e-05, + "loss": 0.1139, + "num_tokens": 264892767.0, + "reward": 0.484375, + "reward_std": 0.3934885561466217, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000489950180054, + "sampling/importance_sampling_ratio/min": 9.891482477542013e-05, + "sampling/sampling_logp_difference/max": 9.221251487731934, + "sampling/sampling_logp_difference/mean": 0.02032080665230751, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.395576979732141e-05, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 4.395576979732141e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16307.0, + "completions/mean_length": 8483.390625, + "completions/mean_terminated_length": 7813.84765625, + "completions/min_length": 1342.0, + "completions/min_terminated_length": 1342.0, + "entropy": 0.9621479511260986, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003174177836626768, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 265995697.0, + "reward": 0.3359375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.0005628522485494614, + "sampling/sampling_logp_difference/max": 7.4824934005737305, + "sampling/sampling_logp_difference/mean": 0.02145479805767536, + "step": 306 + }, + { + "clip_ratio/high_max": 1.2596524811669951e-05, + "clip_ratio/high_mean": 3.149131202917488e-06, + "clip_ratio/low_mean": 3.7911659774181317e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.106079018129094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14985.0, + "completions/mean_length": 7184.578125, + "completions/mean_terminated_length": 6963.79248046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9993807673454285, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003356153378263116, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 266937707.0, + "reward": 0.3828125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.0017036627978086472, + "sampling/sampling_logp_difference/max": 6.374974727630615, + "sampling/sampling_logp_difference/mean": 0.02204768732190132, + "step": 307 + }, + { + "clip_ratio/high_max": 1.9245163684900035e-05, + "clip_ratio/high_mean": 4.811290921225009e-06, + "clip_ratio/low_mean": 4.8845648166206956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.365693925796222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16216.0, + "completions/mean_length": 7029.2265625, + "completions/mean_terminated_length": 6727.45947265625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.9139953926205635, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006375293247401714, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 267853880.0, + "reward": 0.4765625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.010649868287146091, + "sampling/sampling_logp_difference/max": 4.542207717895508, + "sampling/sampling_logp_difference/mean": 0.020365029573440552, + "step": 308 + }, + { + "clip_ratio/high_max": 4.812504812434781e-06, + "clip_ratio/high_mean": 1.2031262031086953e-06, + "clip_ratio/low_mean": 2.5999243803198624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.720237000630732e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6188.0078125, + "completions/mean_terminated_length": 
5943.30419921875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.7640773430466652, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003697809297591448, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 268665721.0, + "reward": 0.5078125, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372363090515, + "sampling/importance_sampling_ratio/min": 0.02927250787615776, + "sampling/sampling_logp_difference/max": 3.531106472015381, + "sampling/sampling_logp_difference/mean": 0.016581017524003983, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1358927824621787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1358927824621787e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 8128.21875, + "completions/mean_terminated_length": 7861.90283203125, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.8218234181404114, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002286596456542611, + "learning_rate": 1e-05, + "loss": 0.0763, + "num_tokens": 269726181.0, + "reward": 0.375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798536300659, + "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06, + "sampling/sampling_logp_difference/max": 12.90043830871582, + "sampling/sampling_logp_difference/mean": 0.019403984770178795, + "step": 310 + }, + { + 
"clip_ratio/high_max": 1.4808477317274082e-05, + "clip_ratio/high_mean": 3.7021193293185206e-06, + "clip_ratio/low_mean": 3.0363167581981543e-05, + "clip_ratio/low_min": 6.364238288369961e-06, + "clip_ratio/region_mean": 3.4065286854456645e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 5673.3359375, + "completions/mean_terminated_length": 5503.32568359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.9275510385632515, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00485506234690547, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 270470616.0, + "reward": 0.4921875, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.0009123464697040617, + "sampling/sampling_logp_difference/max": 6.999490737915039, + "sampling/sampling_logp_difference/mean": 0.01881871558725834, + "step": 311 + }, + { + "clip_ratio/high_max": 1.1274602456978755e-05, + "clip_ratio/high_mean": 3.6739949109687586e-06, + "clip_ratio/low_mean": 3.968570712231667e-05, + "clip_ratio/low_min": 3.4213767321489286e-06, + "clip_ratio/region_mean": 4.335970191959859e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 6944.8984375, + "completions/mean_terminated_length": 6795.07177734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9335741624236107, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005874342750757933, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 
271377723.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000594854354858, + "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05, + "sampling/sampling_logp_difference/max": 10.049861907958984, + "sampling/sampling_logp_difference/mean": 0.020590776577591896, + "step": 312 + }, + { + "clip_ratio/high_max": 1.264126694877632e-05, + "clip_ratio/high_mean": 3.16031673719408e-06, + "clip_ratio/low_mean": 3.206376845810155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.522408474054828e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7705.625, + "completions/mean_terminated_length": 7278.8193359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.8491624072194099, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001684082904830575, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 272384891.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 6.605865200981498e-05, + "sampling/sampling_logp_difference/max": 9.624967575073242, + "sampling/sampling_logp_difference/mean": 0.020136822015047073, + "step": 313 + }, + { + "clip_ratio/high_max": 9.772357770998497e-06, + "clip_ratio/high_mean": 2.443089442749624e-06, + "clip_ratio/low_mean": 3.8573590472879005e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101667946088128e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6611.1484375, + "completions/mean_terminated_length": 6534.19677734375, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "entropy": 0.8867302760481834, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003692191792652011, + "learning_rate": 1e-05, + "loss": 0.1233, + "num_tokens": 273251630.0, + "reward": 0.3984375, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.0031062732450664043, + "sampling/sampling_logp_difference/max": 5.774331569671631, + "sampling/sampling_logp_difference/mean": 0.019237037748098373, + "step": 314 + }, + { + "clip_ratio/high_max": 3.0103737344688852e-05, + "clip_ratio/high_mean": 9.664363972206047e-06, + "clip_ratio/low_mean": 1.7575501146893657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.723986426644842e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15786.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 6770.46875, + "completions/mean_terminated_length": 6770.46875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.8252957463264465, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004167635925114155, + "learning_rate": 1e-05, + "loss": -0.0072, + "num_tokens": 274146482.0, + "reward": 0.5703125, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + 
"sampling/importance_sampling_ratio/min": 0.00010247006866848096, + "sampling/sampling_logp_difference/max": 9.18593978881836, + "sampling/sampling_logp_difference/mean": 0.019684650003910065, + "step": 315 + }, + { + "clip_ratio/high_max": 6.529460733872838e-06, + "clip_ratio/high_mean": 1.6323651834682096e-06, + "clip_ratio/low_mean": 3.877351048231503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.040587566578324e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15827.0, + "completions/mean_length": 8210.859375, + "completions/mean_terminated_length": 7365.36181640625, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.8118235394358635, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030363225378096104, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 275214040.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998943209648132, + "sampling/importance_sampling_ratio/min": 0.002854935359209776, + "sampling/sampling_logp_difference/max": 5.858705997467041, + "sampling/sampling_logp_difference/mean": 0.019275270402431488, + "step": 316 + }, + { + "clip_ratio/high_max": 7.0800629146106075e-06, + "clip_ratio/high_mean": 1.7700157286526519e-06, + "clip_ratio/low_mean": 2.3981688286767167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5751703674359305e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14900.0, + "completions/mean_length": 7072.8828125, + "completions/mean_terminated_length": 6849.41650390625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 
0.8018335327506065, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004777858033776283, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 276138049.0, + "reward": 0.453125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 0.0028502768836915493, + "sampling/sampling_logp_difference/max": 5.860339164733887, + "sampling/sampling_logp_difference/mean": 0.01849908009171486, + "step": 317 + }, + { + "clip_ratio/high_max": 2.259368602608447e-05, + "clip_ratio/high_mean": 5.648421506521117e-06, + "clip_ratio/low_mean": 4.28424866640853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.849090737479855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14447.0, + "completions/mean_length": 5889.8359375, + "completions/mean_terminated_length": 5723.26220703125, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.7976400703191757, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030593445990234613, + "learning_rate": 1e-05, + "loss": 0.1331, + "num_tokens": 276910124.0, + "reward": 0.5859375, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.000139843366923742, + "sampling/sampling_logp_difference/max": 8.874987602233887, + "sampling/sampling_logp_difference/mean": 0.01834402233362198, + "step": 318 + }, + { + "clip_ratio/high_max": 1.4654247024736833e-05, + "clip_ratio/high_mean": 
3.663561756184208e-06, + "clip_ratio/low_mean": 2.377464920755301e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7438210736363544e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 7144.265625, + "completions/mean_terminated_length": 6689.85205078125, + "completions/min_length": 1200.0, + "completions/min_terminated_length": 1200.0, + "entropy": 0.8309404999017715, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004245694726705551, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 277843542.0, + "reward": 0.4453125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998534321784973, + "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05, + "sampling/sampling_logp_difference/max": 11.499897956848145, + "sampling/sampling_logp_difference/mean": 0.01875344291329384, + "step": 319 + }, + { + "clip_ratio/high_max": 6.252500952541595e-06, + "clip_ratio/high_mean": 2.241558604509919e-06, + "clip_ratio/low_mean": 4.735765514851664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9599213525652885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15722.0, + "completions/mean_length": 6779.5234375, + "completions/mean_terminated_length": 6703.8974609375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.9584890529513359, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035574575886130333, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 278730129.0, + "reward": 0.3984375, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 
0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.005792221520096064, + "sampling/sampling_logp_difference/max": 5.151239395141602, + "sampling/sampling_logp_difference/mean": 0.02137477695941925, + "step": 320 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 278730129, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-320/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-320/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-320/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/README.md b/dapo_milora_plus_20251201_131939/checkpoint-384/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-384/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + 
"k_proj", + "up_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-384/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else 
%}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/latest b/dapo_milora_plus_20251201_131939/checkpoint-384/latest new file mode 100644 index 0000000000000000000000000000000000000000..47a30b050fc0cf5b9cd367ab63c36191546d4ff7 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/latest @@ -0,0 +1 @@ +global_step384 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-384/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/tokenizer_config.json 
b/dapo_milora_plus_20251201_131939/checkpoint-384/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + 
"tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..779f29fd3b8eb44e5067bf4a00b20b8c4015fbb7 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/trainer_state.json @@ -0,0 +1,11938 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3532658693652254, + "eval_steps": 500, + "global_step": 384, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + 
"clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + 
"completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + 
"sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, 
+ "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + 
"sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 
1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + 
"clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 
0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + 
"completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + 
"rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 
6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, 
+ "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, 
+ "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + 
"completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 
1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + 
"sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + 
"clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, 
+ "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 
8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 
0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 
0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 
3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + 
"completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, 
+ "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + 
"learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + 
"completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 
6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 
5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + 
"sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + 
"loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + 
"completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 
4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 
0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + 
"completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 
0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 
0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 
3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 
6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 
0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 
0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + 
"sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + 
"completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + 
"sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 
84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + 
"sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 
0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 
5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + 
"rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + 
"completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + 
"sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + 
"learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 
525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 
7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + 
"reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + 
"sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + 
"completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + 
"sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 
1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 
169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + 
"clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 
0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + 
"sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 
656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + 
"clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + 
"completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 
18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 
6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, 
+ "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 
140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + 
"completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 
8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + 
"reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + 
"sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + 
"entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 
3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + 
"completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + 
"sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 
0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 
2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + 
"completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + 
"sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + "sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 
819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + "completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + 
"clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + "sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + 
"reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + 
"sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 
3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + 
"completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + 
"clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + 
"reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + 
"sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + 
"entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 
3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + 
"rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + "reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + 
"completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 
10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 
0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 
63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 
4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 
0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + 
"sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 
6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + "completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + 
"step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 
217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + 
"completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + }, + { + "clip_ratio/high_max": 6.87833608026267e-06, + "clip_ratio/high_mean": 2.9462287329806713e-06, + "clip_ratio/low_mean": 5.435333650893881e-05, + "clip_ratio/low_min": 5.33937054569833e-06, + "clip_ratio/region_mean": 5.729956546929316e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 6448.0078125, + "completions/mean_terminated_length": 6369.771484375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9546648040413857, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004310046322643757, + "learning_rate": 1e-05, + "loss": 0.1082, + "num_tokens": 220304605.0, + "reward": 0.5703125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 0.0001234127557836473, + "sampling/sampling_logp_difference/max": 8.99997615814209, + "sampling/sampling_logp_difference/mean": 0.020253397524356842, + "step": 257 + }, + { + "clip_ratio/high_max": 6.196094091137638e-06, + "clip_ratio/high_mean": 1.5490235227844096e-06, + "clip_ratio/low_mean": 2.5416685957679874e-05, + "clip_ratio/low_min": 5.5736391004757024e-06, + "clip_ratio/region_mean": 2.696570959415112e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 7457.6484375, + "completions/mean_terminated_length": 6941.24755859375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "entropy": 0.8182889074087143, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026646999176591635, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 221281968.0, + "reward": 0.4453125, + "reward_std": 0.2012200653553009, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173283576965, + "sampling/importance_sampling_ratio/min": 2.902353571698768e-06, + "sampling/sampling_logp_difference/max": 12.749988555908203, + "sampling/sampling_logp_difference/mean": 0.019208962097764015, + "step": 258 + }, + { + "clip_ratio/high_max": 1.6189535017474554e-05, + "clip_ratio/high_mean": 4.047383754368639e-06, + "clip_ratio/low_mean": 3.127787306311802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.532525670379982e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 8561.109375, + "completions/mean_terminated_length": 7969.79052734375, + 
"completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.9581378549337387, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016026750672608614, + "learning_rate": 1e-05, + "loss": 0.0131, + "num_tokens": 222399046.0, + "reward": 0.34375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 1.653693971093162e-06, + "sampling/sampling_logp_difference/max": 13.312499046325684, + "sampling/sampling_logp_difference/mean": 0.02173236384987831, + "step": 259 + }, + { + "clip_ratio/high_max": 1.4200771602190798e-05, + "clip_ratio/high_mean": 4.3255887476334465e-06, + "clip_ratio/low_mean": 5.2955770115659107e-05, + "clip_ratio/low_min": 3.402656830076012e-06, + "clip_ratio/region_mean": 5.7281358749605715e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16239.0, + "completions/mean_length": 7152.34375, + "completions/mean_terminated_length": 7079.6533203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9052041247487068, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005460259038954973, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 223335010.0, + "reward": 0.4296875, + "reward_std": 0.3356297016143799, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966621398926, + "sampling/importance_sampling_ratio/min": 0.010161337442696095, + "sampling/sampling_logp_difference/max": 4.589165210723877, + "sampling/sampling_logp_difference/mean": 0.01986619457602501, 
+ "step": 260 + }, + { + "clip_ratio/high_max": 1.4350314813782461e-05, + "clip_ratio/high_mean": 3.5875787034456152e-06, + "clip_ratio/low_mean": 3.81288905373367e-05, + "clip_ratio/low_min": 8.099272235995159e-06, + "clip_ratio/region_mean": 4.1716469809216505e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 6678.65625, + "completions/mean_terminated_length": 6524.603515625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.9043187350034714, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005933742038905621, + "learning_rate": 1e-05, + "loss": 0.0966, + "num_tokens": 224207006.0, + "reward": 0.484375, + "reward_std": 0.3316681981086731, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000031590461731, + "sampling/importance_sampling_ratio/min": 0.0011734943836927414, + "sampling/sampling_logp_difference/max": 6.747769355773926, + "sampling/sampling_logp_difference/mean": 0.019827336072921753, + "step": 261 + }, + { + "clip_ratio/high_max": 1.6498819377375185e-05, + "clip_ratio/high_mean": 4.124704844343796e-06, + "clip_ratio/low_mean": 3.601791678420341e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014262168539062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6999.0390625, + "completions/mean_terminated_length": 6850.07177734375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8109970837831497, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003635740838944912, + "learning_rate": 1e-05, + "loss": 0.104, + "num_tokens": 
225122891.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303817749023, + "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05, + "sampling/sampling_logp_difference/max": 10.987512588500977, + "sampling/sampling_logp_difference/mean": 0.018912551924586296, + "step": 262 + }, + { + "clip_ratio/high_max": 9.527577958579059e-06, + "clip_ratio/high_mean": 2.3818944896447647e-06, + "clip_ratio/low_mean": 3.766565987461945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004755419373396e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7483.7109375, + "completions/mean_terminated_length": 7045.9912109375, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "entropy": 0.9473970532417297, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003405241761356592, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 226102462.0, + "reward": 0.4453125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002920627594, + "sampling/importance_sampling_ratio/min": 0.00525119062513113, + "sampling/sampling_logp_difference/max": 5.249300479888916, + "sampling/sampling_logp_difference/mean": 0.021076779812574387, + "step": 263 + }, + { + "clip_ratio/high_max": 1.5867321963014547e-05, + "clip_ratio/high_mean": 3.966830490753637e-06, + "clip_ratio/low_mean": 3.8259706570897833e-05, + "clip_ratio/low_min": 3.549019083948224e-06, + "clip_ratio/region_mean": 4.2226537743772496e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 7569.03125, + "completions/mean_terminated_length": 7357.47216796875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9231455475091934, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025927501264959574, + "learning_rate": 1e-05, + "loss": 0.0801, + "num_tokens": 227093562.0, + "reward": 0.3984375, + "reward_std": 0.19097033143043518, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0052477638237178326, + "sampling/sampling_logp_difference/max": 5.249953269958496, + "sampling/sampling_logp_difference/mean": 0.020578444004058838, + "step": 264 + }, + { + "clip_ratio/high_max": 1.344091060673236e-05, + "clip_ratio/high_mean": 3.36022765168309e-06, + "clip_ratio/low_mean": 4.253613235505327e-05, + "clip_ratio/low_min": 3.5579084851633525e-06, + "clip_ratio/region_mean": 4.5896360120423196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 7589.2734375, + "completions/mean_terminated_length": 7378.2001953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9265239909291267, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030512227676808834, + "learning_rate": 1e-05, + "loss": 0.04, + "num_tokens": 228086405.0, + "reward": 0.4296875, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0002165911573683843, + "sampling/sampling_logp_difference/max": 8.437499046325684, + "sampling/sampling_logp_difference/mean": 0.020208362489938736, + "step": 265 + }, + { + "clip_ratio/high_max": 1.9613525410022703e-05, + "clip_ratio/high_mean": 4.903381352505676e-06, + "clip_ratio/low_mean": 3.184792547017423e-05, + "clip_ratio/low_min": 7.29296516510658e-06, + "clip_ratio/region_mean": 3.675130722058384e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 8420.6875, + "completions/mean_terminated_length": 8096.97509765625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "entropy": 0.9572964608669281, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022430522367358208, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 229183765.0, + "reward": 0.34375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 0.00029693738906644285, + "sampling/sampling_logp_difference/max": 8.121989250183105, + "sampling/sampling_logp_difference/mean": 0.021570362150669098, + "step": 266 + }, + { + "clip_ratio/high_max": 6.728750577167375e-06, + "clip_ratio/high_mean": 1.6821876442918438e-06, + "clip_ratio/low_mean": 2.1682553096979973e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.336474062758498e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15736.0, + "completions/mean_length": 6809.765625, + "completions/mean_terminated_length": 6579.984375, + 
"completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.884086549282074, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004295065999031067, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 230077607.0, + "reward": 0.484375, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00754612497985363, + "sampling/sampling_logp_difference/max": 4.886721134185791, + "sampling/sampling_logp_difference/mean": 0.019895706325769424, + "step": 267 + }, + { + "clip_ratio/high_max": 2.8609347509700456e-05, + "clip_ratio/high_mean": 7.152336877425114e-06, + "clip_ratio/low_mean": 5.158006410965754e-05, + "clip_ratio/low_min": 5.210069957684027e-06, + "clip_ratio/region_mean": 5.873240070286556e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15080.0, + "completions/mean_length": 7340.6953125, + "completions/mean_terminated_length": 6973.0810546875, + "completions/min_length": 1616.0, + "completions/min_terminated_length": 1616.0, + "entropy": 0.9920620769262314, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004631794057786465, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 231035616.0, + "reward": 0.4375, + "reward_std": 0.3235401213169098, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.0002508950710762292, + "sampling/sampling_logp_difference/max": 8.290475845336914, + "sampling/sampling_logp_difference/mean": 0.020591016858816147, + 
"step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.3085940774290066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3085940774290066e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14120.0, + "completions/mean_length": 6748.875, + "completions/mean_terminated_length": 6595.93701171875, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.9867061004042625, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035752104595303535, + "learning_rate": 1e-05, + "loss": 0.0455, + "num_tokens": 231920056.0, + "reward": 0.40625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.0003869794018100947, + "sampling/sampling_logp_difference/max": 7.8571391105651855, + "sampling/sampling_logp_difference/mean": 0.02061416581273079, + "step": 269 + }, + { + "clip_ratio/high_max": 1.2506750408647349e-05, + "clip_ratio/high_mean": 3.1266876021618373e-06, + "clip_ratio/low_mean": 3.10397430212106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.416643085074611e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 7260.3046875, + "completions/mean_terminated_length": 7188.46435546875, + "completions/min_length": 1384.0, + "completions/min_terminated_length": 1384.0, + "entropy": 1.0388494208455086, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036644963547587395, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 232869159.0, + "reward": 0.390625, + "reward_std": 
0.2359209954738617, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999546408653259, + "sampling/importance_sampling_ratio/min": 0.0008660226594656706, + "sampling/sampling_logp_difference/max": 7.051599502563477, + "sampling/sampling_logp_difference/mean": 0.02120530977845192, + "step": 270 + }, + { + "clip_ratio/high_max": 2.704355301830219e-05, + "clip_ratio/high_mean": 6.760888254575548e-06, + "clip_ratio/low_mean": 3.1861192269388994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862208097871189e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16073.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 6354.4609375, + "completions/mean_terminated_length": 6354.4609375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "entropy": 0.8405331820249557, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004709267523139715, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 233702842.0, + "reward": 0.546875, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 0.0046309432946145535, + "sampling/sampling_logp_difference/max": 5.37499475479126, + "sampling/sampling_logp_difference/mean": 0.019126038998365402, + "step": 271 + }, + { + "clip_ratio/high_max": 9.749228638611385e-06, + "clip_ratio/high_mean": 2.437307159652846e-06, + "clip_ratio/low_mean": 3.855073941849696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.098804652130639e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16026.0, + "completions/mean_length": 6514.578125, + "completions/mean_terminated_length": 6357.9208984375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 1.0254098922014236, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003066045930609107, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 234556348.0, + "reward": 0.4375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 0.005210204049944878, + "sampling/sampling_logp_difference/max": 5.257136344909668, + "sampling/sampling_logp_difference/mean": 0.019960148259997368, + "step": 272 + }, + { + "clip_ratio/high_max": 1.0475813724042382e-05, + "clip_ratio/high_mean": 2.6189534310105955e-06, + "clip_ratio/low_mean": 3.487835761006863e-05, + "clip_ratio/low_min": 2.9392399483185727e-06, + "clip_ratio/region_mean": 3.749731081370555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 7379.5546875, + "completions/mean_terminated_length": 7236.62744140625, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 1.0397320613265038, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005132520105689764, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 235521091.0, + "reward": 0.2890625, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999256134033203, + "sampling/importance_sampling_ratio/min": 
0.00016659013635944575, + "sampling/sampling_logp_difference/max": 8.699974060058594, + "sampling/sampling_logp_difference/mean": 0.021417103707790375, + "step": 273 + }, + { + "clip_ratio/high_max": 1.9904123973901733e-05, + "clip_ratio/high_mean": 5.776861314643611e-06, + "clip_ratio/low_mean": 2.6659268655748747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2436129686175263e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 7837.1640625, + "completions/mean_terminated_length": 7632.04052734375, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "entropy": 0.8400963917374611, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028969801496714354, + "learning_rate": 1e-05, + "loss": 0.0143, + "num_tokens": 236544160.0, + "reward": 0.3828125, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887943267822, + "sampling/importance_sampling_ratio/min": 2.883308241052873e-07, + "sampling/sampling_logp_difference/max": 15.059157371520996, + "sampling/sampling_logp_difference/mean": 0.019267702475190163, + "step": 274 + }, + { + "clip_ratio/high_max": 8.562770290154731e-06, + "clip_ratio/high_mean": 2.1406925725386827e-06, + "clip_ratio/low_mean": 4.060094340729847e-05, + "clip_ratio/low_min": 3.8700886761944275e-06, + "clip_ratio/region_mean": 4.2741635979837156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15350.0, + "completions/mean_length": 6696.3515625, + "completions/mean_terminated_length": 6542.57958984375, + "completions/min_length": 1239.0, + "completions/min_terminated_length": 1239.0, + "entropy": 
0.8495818004012108, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003412836929783225, + "learning_rate": 1e-05, + "loss": 0.0803, + "num_tokens": 237423101.0, + "reward": 0.515625, + "reward_std": 0.37981897592544556, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.012152798473834991, + "sampling/sampling_logp_difference/max": 4.410195827484131, + "sampling/sampling_logp_difference/mean": 0.018458625301718712, + "step": 275 + }, + { + "clip_ratio/high_max": 1.1463653436294408e-05, + "clip_ratio/high_mean": 3.646129641765583e-06, + "clip_ratio/low_mean": 6.144847083078275e-05, + "clip_ratio/low_min": 1.110105540647055e-05, + "clip_ratio/region_mean": 6.509460160941671e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15666.0, + "completions/mean_length": 7700.3671875, + "completions/mean_terminated_length": 7121.45849609375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.8258870914578438, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024443145375698805, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 238429956.0, + "reward": 0.375, + "reward_std": 0.2872493863105774, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999113082885742, + "sampling/importance_sampling_ratio/min": 0.00026112530031241477, + "sampling/sampling_logp_difference/max": 8.250510215759277, + "sampling/sampling_logp_difference/mean": 0.019427984952926636, + "step": 276 + }, + { + "clip_ratio/high_max": 4.218127742205979e-06, + "clip_ratio/high_mean": 
1.0545319355514948e-06, + "clip_ratio/low_mean": 1.7289162997258245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.834369493280974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16112.0, + "completions/mean_length": 6255.21875, + "completions/mean_terminated_length": 6094.44482421875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.8179014846682549, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022747826296836138, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 239250160.0, + "reward": 0.5234375, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.0002633975527714938, + "sampling/sampling_logp_difference/max": 8.241846084594727, + "sampling/sampling_logp_difference/mean": 0.018723051995038986, + "step": 277 + }, + { + "clip_ratio/high_max": 1.698448841125355e-05, + "clip_ratio/high_mean": 5.369374321162468e-06, + "clip_ratio/low_mean": 6.14647315160255e-05, + "clip_ratio/low_min": 5.043576493335422e-06, + "clip_ratio/region_mean": 6.683410583718796e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15321.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6914.9609375, + "completions/mean_terminated_length": 6914.9609375, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9700981751084328, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005685295443981886, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 240156211.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 
0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05, + "sampling/sampling_logp_difference/max": 9.997581481933594, + "sampling/sampling_logp_difference/mean": 0.021195171400904655, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9186837764427764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9186837764427764e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15469.0, + "completions/mean_length": 5227.53125, + "completions/mean_terminated_length": 5139.68505859375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9116031974554062, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003880272386595607, + "learning_rate": 1e-05, + "loss": 0.1246, + "num_tokens": 240845295.0, + "reward": 0.6328125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000362396240234, + "sampling/importance_sampling_ratio/min": 0.00012422871077433228, + "sampling/sampling_logp_difference/max": 8.993386268615723, + "sampling/sampling_logp_difference/mean": 0.018801718950271606, + "step": 279 + }, + { + "clip_ratio/high_max": 2.5015486926349695e-05, + "clip_ratio/high_mean": 8.084949570275057e-06, + "clip_ratio/low_mean": 5.524710468307603e-05, + "clip_ratio/low_min": 3.776891389861703e-06, + "clip_ratio/region_mean": 6.333205465125502e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 
8065.4765625, + "completions/mean_terminated_length": 7510.90869140625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.7446574792265892, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028986844699829817, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 241895676.0, + "reward": 0.4921875, + "reward_std": 0.3474721610546112, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.0017039099475368857, + "sampling/sampling_logp_difference/max": 6.3748297691345215, + "sampling/sampling_logp_difference/mean": 0.01853121444582939, + "step": 280 + }, + { + "clip_ratio/high_max": 9.486341014053323e-06, + "clip_ratio/high_mean": 2.371585253513331e-06, + "clip_ratio/low_mean": 2.896106741445692e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133265261112683e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15534.0, + "completions/max_terminated_length": 15534.0, + "completions/mean_length": 6127.359375, + "completions/mean_terminated_length": 6127.359375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.8569132760167122, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003845847910270095, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 242698258.0, + "reward": 0.53125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000942945480347, + "sampling/importance_sampling_ratio/min": 0.00043231461313553154, + "sampling/sampling_logp_difference/max": 7.746356964111328, + 
"sampling/sampling_logp_difference/mean": 0.01856958493590355, + "step": 281 + }, + { + "clip_ratio/high_max": 2.9848330086679198e-05, + "clip_ratio/high_mean": 7.4620825216697995e-06, + "clip_ratio/low_mean": 4.3558867673709756e-05, + "clip_ratio/low_min": 4.417741820361698e-06, + "clip_ratio/region_mean": 5.1020949285884853e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15192.0, + "completions/mean_length": 6600.1484375, + "completions/mean_terminated_length": 6365.33642578125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.78924310952425, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003953634761273861, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 243560957.0, + "reward": 0.5546875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.0006525487406179309, + "sampling/sampling_logp_difference/max": 7.334624767303467, + "sampling/sampling_logp_difference/mean": 0.018097909167408943, + "step": 282 + }, + { + "clip_ratio/high_max": 6.635561703660642e-06, + "clip_ratio/high_mean": 1.6588904259151604e-06, + "clip_ratio/low_mean": 2.737523408313791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9034124281679397e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7852.171875, + "completions/mean_terminated_length": 7852.171875, + "completions/min_length": 1276.0, + "completions/min_terminated_length": 1276.0, + "entropy": 1.0598893761634827, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00360781978815794, 
+ "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 244585923.0, + "reward": 0.3125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05, + "sampling/sampling_logp_difference/max": 10.076086044311523, + "sampling/sampling_logp_difference/mean": 0.022330068051815033, + "step": 283 + }, + { + "clip_ratio/high_max": 3.1540168947685743e-06, + "clip_ratio/high_mean": 7.885042236921436e-07, + "clip_ratio/low_mean": 4.7973388973332476e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.876189268543385e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7972.2265625, + "completions/mean_terminated_length": 7700.87890625, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.933217465877533, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0027661293279379606, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 245628064.0, + "reward": 0.28125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05, + "sampling/sampling_logp_difference/max": 10.366576194763184, + "sampling/sampling_logp_difference/mean": 0.021125148981809616, + "step": 284 + }, + { + "clip_ratio/high_max": 1.2965969062861404e-05, + "clip_ratio/high_mean": 3.241492265715351e-06, + "clip_ratio/low_mean": 4.6317693090713874e-05, + "clip_ratio/low_min": 3.820877282123547e-06, + 
"clip_ratio/region_mean": 4.955918507221213e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7135.6953125, + "completions/mean_terminated_length": 6913.736328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.7786942347884178, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005680318456143141, + "learning_rate": 1e-05, + "loss": 0.0786, + "num_tokens": 246561329.0, + "reward": 0.4296875, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462366104126, + "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05, + "sampling/sampling_logp_difference/max": 9.737424850463867, + "sampling/sampling_logp_difference/mean": 0.018504241481423378, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.22437145175536e-05, + "clip_ratio/low_min": 1.4025082009538892e-05, + "clip_ratio/region_mean": 4.22437145175536e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6704.046875, + "completions/mean_terminated_length": 6627.82666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 1.0435140281915665, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026402862276881933, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 247437415.0, + "reward": 0.3828125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.0007800163584761322, + "sampling/sampling_logp_difference/max": 7.156195640563965, + "sampling/sampling_logp_difference/mean": 0.02134273201227188, + "step": 286 + }, + { + "clip_ratio/high_max": 2.223430897174694e-05, + "clip_ratio/high_mean": 6.8746438159905665e-06, + "clip_ratio/low_mean": 4.7084630978133646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3959275192028144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 5892.5078125, + "completions/mean_terminated_length": 5725.9765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.8004944771528244, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003993614576756954, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 248211112.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0024652592837810516, + "sampling/sampling_logp_difference/max": 6.005458354949951, + "sampling/sampling_logp_difference/mean": 0.01924925297498703, + "step": 287 + }, + { + "clip_ratio/high_max": 2.1833082200828358e-05, + "clip_ratio/high_mean": 5.458270550207089e-06, + "clip_ratio/low_mean": 3.415995615796419e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961822596920683e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 7812.140625, + "completions/mean_terminated_length": 7316.24755859375, + "completions/min_length": 1515.0, + 
"completions/min_terminated_length": 1515.0, + "entropy": 0.8841542899608612, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001573400106281042, + "learning_rate": 1e-05, + "loss": 0.0823, + "num_tokens": 249228106.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 0.001001527882181108, + "sampling/sampling_logp_difference/max": 6.906228542327881, + "sampling/sampling_logp_difference/mean": 0.01956877112388611, + "step": 288 + }, + { + "clip_ratio/high_max": 1.014439021673752e-05, + "clip_ratio/high_mean": 2.53609755418438e-06, + "clip_ratio/low_mean": 3.068193461785995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.321803217204433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 6372.953125, + "completions/mean_terminated_length": 6132.6884765625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.8228401988744736, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021125099156051874, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 250063284.0, + "reward": 0.5, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05, + "sampling/sampling_logp_difference/max": 9.937475204467773, + "sampling/sampling_logp_difference/mean": 0.01943521574139595, + "step": 289 + }, + { + "clip_ratio/high_max": 
7.023906164249638e-06, + "clip_ratio/high_mean": 1.7559765410624095e-06, + "clip_ratio/low_mean": 2.526416994896863e-05, + "clip_ratio/low_min": 6.7760895490209805e-06, + "clip_ratio/region_mean": 2.7020146660561295e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16270.0, + "completions/mean_length": 7817.8671875, + "completions/mean_terminated_length": 7396.58154296875, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.9454319775104523, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022315154783427715, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 251085123.0, + "reward": 0.40625, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06, + "sampling/sampling_logp_difference/max": 12.760490417480469, + "sampling/sampling_logp_difference/mean": 0.021764669567346573, + "step": 290 + }, + { + "clip_ratio/high_max": 1.4797966287005693e-05, + "clip_ratio/high_mean": 3.699491571751423e-06, + "clip_ratio/low_mean": 4.36271948274225e-05, + "clip_ratio/low_min": 3.6957101201551268e-06, + "clip_ratio/region_mean": 4.732668639917392e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 7168.4921875, + "completions/mean_terminated_length": 6635.36328125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8433891162276268, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 252020906.0, + "reward": 
0.5546875, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.0003851866349577904, + "sampling/sampling_logp_difference/max": 7.861782550811768, + "sampling/sampling_logp_difference/mean": 0.01929781585931778, + "step": 291 + }, + { + "clip_ratio/high_max": 1.996871560550062e-05, + "clip_ratio/high_mean": 6.089093403716106e-06, + "clip_ratio/low_mean": 4.2792244585143635e-05, + "clip_ratio/low_min": 1.0337215371691855e-05, + "clip_ratio/region_mean": 4.8881338216233416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7322.5078125, + "completions/mean_terminated_length": 6876.8603515625, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 0.9157031401991844, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036942458245903254, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 252977435.0, + "reward": 0.3359375, + "reward_std": 0.24275577068328857, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.00029605376766994596, + "sampling/sampling_logp_difference/max": 8.124969482421875, + "sampling/sampling_logp_difference/mean": 0.0205365102738142, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.631919460327481e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.631919460327481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16078.0, + "completions/mean_length": 7025.484375, + "completions/mean_terminated_length": 6723.5966796875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 1.1329731941223145, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034127074759453535, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 253896161.0, + "reward": 0.25, + "reward_std": 0.27722424268722534, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0005197672289796174, + "sampling/sampling_logp_difference/max": 7.562129497528076, + "sampling/sampling_logp_difference/mean": 0.023741140961647034, + "step": 293 + }, + { + "clip_ratio/high_max": 4.368643658381188e-06, + "clip_ratio/high_mean": 1.092160914595297e-06, + "clip_ratio/low_mean": 2.4661783299961826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5753944555617636e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13776.0, + "completions/mean_length": 5996.1796875, + "completions/mean_terminated_length": 5661.08837890625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8773328885436058, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003959407564252615, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 254690264.0, + "reward": 0.53125, + "reward_std": 0.26645541191101074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07, + 
"sampling/sampling_logp_difference/max": 15.73043155670166, + "sampling/sampling_logp_difference/mean": 0.018407585099339485, + "step": 294 + }, + { + "clip_ratio/high_max": 1.616483677935321e-05, + "clip_ratio/high_mean": 4.041209194838302e-06, + "clip_ratio/low_mean": 3.736187466074625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140308453770558e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7165.328125, + "completions/mean_terminated_length": 6867.951171875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.9502597972750664, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030910037457942963, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 255626394.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000731945037842, + "sampling/importance_sampling_ratio/min": 0.00022311302018351853, + "sampling/sampling_logp_difference/max": 8.407832145690918, + "sampling/sampling_logp_difference/mean": 0.020668907091021538, + "step": 295 + }, + { + "clip_ratio/high_max": 1.1702686606440693e-05, + "clip_ratio/high_mean": 2.9256716516101733e-06, + "clip_ratio/low_mean": 5.5247357522603124e-05, + "clip_ratio/low_min": 3.6811261452385224e-06, + "clip_ratio/region_mean": 5.8173028264718596e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15375.0, + "completions/mean_length": 8001.9296875, + "completions/mean_terminated_length": 7661.34912109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8591345250606537, + "epoch": 
0.27230910763569455, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037233952898532152, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 256673457.0, + "reward": 0.421875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.0021876997780054808, + "sampling/sampling_logp_difference/max": 6.124904632568359, + "sampling/sampling_logp_difference/mean": 0.020540472120046616, + "step": 296 + }, + { + "clip_ratio/high_max": 3.721341136042611e-05, + "clip_ratio/high_mean": 1.2759249216287571e-05, + "clip_ratio/low_mean": 3.570647322703735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.846572301175911e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 6924.84375, + "completions/mean_terminated_length": 6697.82421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.7969356626272202, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006054217461496592, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 257578501.0, + "reward": 0.5078125, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.007889713160693645, + "sampling/sampling_logp_difference/max": 4.842195510864258, + "sampling/sampling_logp_difference/mean": 0.019306108355522156, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0211543894911301e-05, + "clip_ratio/high_mean": 2.5528859737278253e-06, + 
"clip_ratio/low_mean": 5.2388056587915344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4940942732173426e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14439.0, + "completions/mean_length": 6203.03125, + "completions/mean_terminated_length": 5958.6884765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.8734413683414459, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004903806839138269, + "learning_rate": 1e-05, + "loss": 0.0689, + "num_tokens": 258392625.0, + "reward": 0.4453125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 0.00020370795391499996, + "sampling/sampling_logp_difference/max": 8.498823165893555, + "sampling/sampling_logp_difference/mean": 0.01909301057457924, + "step": 298 + }, + { + "clip_ratio/high_max": 1.5135058674786706e-05, + "clip_ratio/high_mean": 4.64845766146027e-06, + "clip_ratio/low_mean": 4.373456977191381e-05, + "clip_ratio/low_min": 3.670856358439778e-06, + "clip_ratio/region_mean": 4.8383026296505705e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 7982.5390625, + "completions/mean_terminated_length": 7641.01611328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0091779381036758, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033637424930930138, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 259435270.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999765753746033, + "sampling/importance_sampling_ratio/min": 0.0016514655435457826, + "sampling/sampling_logp_difference/max": 6.406092166900635, + "sampling/sampling_logp_difference/mean": 0.02182736061513424, + "step": 299 + }, + { + "clip_ratio/high_max": 2.3964702677403693e-05, + "clip_ratio/high_mean": 5.991175669350923e-06, + "clip_ratio/low_mean": 5.2442986770984135e-05, + "clip_ratio/low_min": 8.75736759553547e-06, + "clip_ratio/region_mean": 5.843416238349164e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6915.3125, + "completions/mean_terminated_length": 6688.064453125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.7964543774724007, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052203768864274025, + "learning_rate": 1e-05, + "loss": 0.144, + "num_tokens": 260337614.0, + "reward": 0.46875, + "reward_std": 0.37928223609924316, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 7.032832218101248e-05, + "sampling/sampling_logp_difference/max": 9.562335968017578, + "sampling/sampling_logp_difference/mean": 0.017896221950650215, + "step": 300 + }, + { + "clip_ratio/high_max": 4.458271632756805e-05, + "clip_ratio/high_mean": 1.1145679081892013e-05, + "clip_ratio/low_mean": 6.243192206056847e-05, + "clip_ratio/low_min": 1.2397775662975619e-05, + "clip_ratio/region_mean": 7.357759886872373e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + 
"completions/mean_length": 7029.4375, + "completions/mean_terminated_length": 6880.95263671875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.8605096861720085, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005570738110691309, + "learning_rate": 1e-05, + "loss": 0.0984, + "num_tokens": 261254070.0, + "reward": 0.4765625, + "reward_std": 0.3327290117740631, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999494552612305, + "sampling/importance_sampling_ratio/min": 0.0009070249507203698, + "sampling/sampling_logp_difference/max": 7.005340576171875, + "sampling/sampling_logp_difference/mean": 0.01905740052461624, + "step": 301 + }, + { + "clip_ratio/high_max": 3.390461233720998e-05, + "clip_ratio/high_mean": 1.1191766247975465e-05, + "clip_ratio/low_mean": 7.46641262594494e-05, + "clip_ratio/low_min": 5.041745680500753e-06, + "clip_ratio/region_mean": 8.585589102949598e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5858.84375, + "completions/mean_terminated_length": 5606.240234375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8430554121732712, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004496110137552023, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 262024906.0, + "reward": 0.4453125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294877052307, + "sampling/importance_sampling_ratio/min": 0.00040469475788995624, + 
"sampling/sampling_logp_difference/max": 7.812377452850342, + "sampling/sampling_logp_difference/mean": 0.019225869327783585, + "step": 302 + }, + { + "clip_ratio/high_max": 3.2563955301156966e-06, + "clip_ratio/high_mean": 8.140988825289242e-07, + "clip_ratio/low_mean": 3.7080020149460324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.789411886145899e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15976.0, + "completions/mean_length": 8337.328125, + "completions/mean_terminated_length": 7728.7568359375, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.901745393872261, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00348713924176991, + "learning_rate": 1e-05, + "loss": -0.0002, + "num_tokens": 263110844.0, + "reward": 0.296875, + "reward_std": 0.20805485546588898, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.0022652465850114822, + "sampling/sampling_logp_difference/max": 6.090071678161621, + "sampling/sampling_logp_difference/mean": 0.02157524600625038, + "step": 303 + }, + { + "clip_ratio/high_max": 2.3739744847262045e-05, + "clip_ratio/high_mean": 5.934936211815511e-06, + "clip_ratio/low_mean": 2.823553325015382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.417046866616147e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7084.7265625, + "completions/mean_terminated_length": 6381.42041015625, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8265534415841103, + "epoch": 0.2796688132474701, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003980033565312624, + "learning_rate": 1e-05, + "loss": 0.0551, + "num_tokens": 264036169.0, + "reward": 0.3984375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673366546631, + "sampling/importance_sampling_ratio/min": 0.00012345099821686745, + "sampling/sampling_logp_difference/max": 8.999666213989258, + "sampling/sampling_logp_difference/mean": 0.018782664090394974, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1745505617000163e-05, + "clip_ratio/high_mean": 3.771558226617344e-06, + "clip_ratio/low_mean": 6.913120819262986e-05, + "clip_ratio/low_min": 2.494283216947224e-05, + "clip_ratio/region_mean": 7.290276607818669e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6543.796875, + "completions/mean_terminated_length": 6543.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8899869695305824, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.006467343773692846, + "learning_rate": 1e-05, + "loss": 0.1139, + "num_tokens": 264892767.0, + "reward": 0.484375, + "reward_std": 0.3934885561466217, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000489950180054, + "sampling/importance_sampling_ratio/min": 9.891482477542013e-05, + "sampling/sampling_logp_difference/max": 9.221251487731934, + "sampling/sampling_logp_difference/mean": 0.02032080665230751, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.395576979732141e-05, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 4.395576979732141e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16307.0, + "completions/mean_length": 8483.390625, + "completions/mean_terminated_length": 7813.84765625, + "completions/min_length": 1342.0, + "completions/min_terminated_length": 1342.0, + "entropy": 0.9621479511260986, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003174177836626768, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 265995697.0, + "reward": 0.3359375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.0005628522485494614, + "sampling/sampling_logp_difference/max": 7.4824934005737305, + "sampling/sampling_logp_difference/mean": 0.02145479805767536, + "step": 306 + }, + { + "clip_ratio/high_max": 1.2596524811669951e-05, + "clip_ratio/high_mean": 3.149131202917488e-06, + "clip_ratio/low_mean": 3.7911659774181317e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.106079018129094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14985.0, + "completions/mean_length": 7184.578125, + "completions/mean_terminated_length": 6963.79248046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9993807673454285, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003356153378263116, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 266937707.0, + "reward": 0.3828125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.0017036627978086472, + "sampling/sampling_logp_difference/max": 6.374974727630615, + "sampling/sampling_logp_difference/mean": 0.02204768732190132, + "step": 307 + }, + { + "clip_ratio/high_max": 1.9245163684900035e-05, + "clip_ratio/high_mean": 4.811290921225009e-06, + "clip_ratio/low_mean": 4.8845648166206956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.365693925796222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16216.0, + "completions/mean_length": 7029.2265625, + "completions/mean_terminated_length": 6727.45947265625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.9139953926205635, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006375293247401714, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 267853880.0, + "reward": 0.4765625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.010649868287146091, + "sampling/sampling_logp_difference/max": 4.542207717895508, + "sampling/sampling_logp_difference/mean": 0.020365029573440552, + "step": 308 + }, + { + "clip_ratio/high_max": 4.812504812434781e-06, + "clip_ratio/high_mean": 1.2031262031086953e-06, + "clip_ratio/low_mean": 2.5999243803198624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.720237000630732e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6188.0078125, + "completions/mean_terminated_length": 
5943.30419921875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.7640773430466652, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003697809297591448, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 268665721.0, + "reward": 0.5078125, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372363090515, + "sampling/importance_sampling_ratio/min": 0.02927250787615776, + "sampling/sampling_logp_difference/max": 3.531106472015381, + "sampling/sampling_logp_difference/mean": 0.016581017524003983, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1358927824621787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1358927824621787e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 8128.21875, + "completions/mean_terminated_length": 7861.90283203125, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.8218234181404114, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002286596456542611, + "learning_rate": 1e-05, + "loss": 0.0763, + "num_tokens": 269726181.0, + "reward": 0.375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798536300659, + "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06, + "sampling/sampling_logp_difference/max": 12.90043830871582, + "sampling/sampling_logp_difference/mean": 0.019403984770178795, + "step": 310 + }, + { + 
"clip_ratio/high_max": 1.4808477317274082e-05, + "clip_ratio/high_mean": 3.7021193293185206e-06, + "clip_ratio/low_mean": 3.0363167581981543e-05, + "clip_ratio/low_min": 6.364238288369961e-06, + "clip_ratio/region_mean": 3.4065286854456645e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 5673.3359375, + "completions/mean_terminated_length": 5503.32568359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.9275510385632515, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00485506234690547, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 270470616.0, + "reward": 0.4921875, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.0009123464697040617, + "sampling/sampling_logp_difference/max": 6.999490737915039, + "sampling/sampling_logp_difference/mean": 0.01881871558725834, + "step": 311 + }, + { + "clip_ratio/high_max": 1.1274602456978755e-05, + "clip_ratio/high_mean": 3.6739949109687586e-06, + "clip_ratio/low_mean": 3.968570712231667e-05, + "clip_ratio/low_min": 3.4213767321489286e-06, + "clip_ratio/region_mean": 4.335970191959859e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 6944.8984375, + "completions/mean_terminated_length": 6795.07177734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9335741624236107, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005874342750757933, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 
271377723.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000594854354858, + "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05, + "sampling/sampling_logp_difference/max": 10.049861907958984, + "sampling/sampling_logp_difference/mean": 0.020590776577591896, + "step": 312 + }, + { + "clip_ratio/high_max": 1.264126694877632e-05, + "clip_ratio/high_mean": 3.16031673719408e-06, + "clip_ratio/low_mean": 3.206376845810155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.522408474054828e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7705.625, + "completions/mean_terminated_length": 7278.8193359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.8491624072194099, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001684082904830575, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 272384891.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 6.605865200981498e-05, + "sampling/sampling_logp_difference/max": 9.624967575073242, + "sampling/sampling_logp_difference/mean": 0.020136822015047073, + "step": 313 + }, + { + "clip_ratio/high_max": 9.772357770998497e-06, + "clip_ratio/high_mean": 2.443089442749624e-06, + "clip_ratio/low_mean": 3.8573590472879005e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101667946088128e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6611.1484375, + "completions/mean_terminated_length": 6534.19677734375, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "entropy": 0.8867302760481834, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003692191792652011, + "learning_rate": 1e-05, + "loss": 0.1233, + "num_tokens": 273251630.0, + "reward": 0.3984375, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.0031062732450664043, + "sampling/sampling_logp_difference/max": 5.774331569671631, + "sampling/sampling_logp_difference/mean": 0.019237037748098373, + "step": 314 + }, + { + "clip_ratio/high_max": 3.0103737344688852e-05, + "clip_ratio/high_mean": 9.664363972206047e-06, + "clip_ratio/low_mean": 1.7575501146893657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.723986426644842e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15786.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 6770.46875, + "completions/mean_terminated_length": 6770.46875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.8252957463264465, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004167635925114155, + "learning_rate": 1e-05, + "loss": -0.0072, + "num_tokens": 274146482.0, + "reward": 0.5703125, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + 
"sampling/importance_sampling_ratio/min": 0.00010247006866848096, + "sampling/sampling_logp_difference/max": 9.18593978881836, + "sampling/sampling_logp_difference/mean": 0.019684650003910065, + "step": 315 + }, + { + "clip_ratio/high_max": 6.529460733872838e-06, + "clip_ratio/high_mean": 1.6323651834682096e-06, + "clip_ratio/low_mean": 3.877351048231503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.040587566578324e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15827.0, + "completions/mean_length": 8210.859375, + "completions/mean_terminated_length": 7365.36181640625, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.8118235394358635, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030363225378096104, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 275214040.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998943209648132, + "sampling/importance_sampling_ratio/min": 0.002854935359209776, + "sampling/sampling_logp_difference/max": 5.858705997467041, + "sampling/sampling_logp_difference/mean": 0.019275270402431488, + "step": 316 + }, + { + "clip_ratio/high_max": 7.0800629146106075e-06, + "clip_ratio/high_mean": 1.7700157286526519e-06, + "clip_ratio/low_mean": 2.3981688286767167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5751703674359305e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14900.0, + "completions/mean_length": 7072.8828125, + "completions/mean_terminated_length": 6849.41650390625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 
0.8018335327506065, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004777858033776283, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 276138049.0, + "reward": 0.453125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 0.0028502768836915493, + "sampling/sampling_logp_difference/max": 5.860339164733887, + "sampling/sampling_logp_difference/mean": 0.01849908009171486, + "step": 317 + }, + { + "clip_ratio/high_max": 2.259368602608447e-05, + "clip_ratio/high_mean": 5.648421506521117e-06, + "clip_ratio/low_mean": 4.28424866640853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.849090737479855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14447.0, + "completions/mean_length": 5889.8359375, + "completions/mean_terminated_length": 5723.26220703125, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.7976400703191757, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030593445990234613, + "learning_rate": 1e-05, + "loss": 0.1331, + "num_tokens": 276910124.0, + "reward": 0.5859375, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.000139843366923742, + "sampling/sampling_logp_difference/max": 8.874987602233887, + "sampling/sampling_logp_difference/mean": 0.01834402233362198, + "step": 318 + }, + { + "clip_ratio/high_max": 1.4654247024736833e-05, + "clip_ratio/high_mean": 
3.663561756184208e-06, + "clip_ratio/low_mean": 2.377464920755301e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7438210736363544e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 7144.265625, + "completions/mean_terminated_length": 6689.85205078125, + "completions/min_length": 1200.0, + "completions/min_terminated_length": 1200.0, + "entropy": 0.8309404999017715, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004245694726705551, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 277843542.0, + "reward": 0.4453125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998534321784973, + "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05, + "sampling/sampling_logp_difference/max": 11.499897956848145, + "sampling/sampling_logp_difference/mean": 0.01875344291329384, + "step": 319 + }, + { + "clip_ratio/high_max": 6.252500952541595e-06, + "clip_ratio/high_mean": 2.241558604509919e-06, + "clip_ratio/low_mean": 4.735765514851664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9599213525652885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15722.0, + "completions/mean_length": 6779.5234375, + "completions/mean_terminated_length": 6703.8974609375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.9584890529513359, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035574575886130333, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 278730129.0, + "reward": 0.3984375, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 
0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.005792221520096064, + "sampling/sampling_logp_difference/max": 5.151239395141602, + "sampling/sampling_logp_difference/mean": 0.02137477695941925, + "step": 320 + }, + { + "clip_ratio/high_max": 3.2948471016425174e-05, + "clip_ratio/high_mean": 9.518853403278627e-06, + "clip_ratio/low_mean": 2.195712454522436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.14759782895635e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15892.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 5582.9765625, + "completions/mean_terminated_length": 5582.9765625, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8629376217722893, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037982752546668053, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 279462542.0, + "reward": 0.5546875, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999780058860779, + "sampling/importance_sampling_ratio/min": 0.0021874974481761456, + "sampling/sampling_logp_difference/max": 6.124997138977051, + "sampling/sampling_logp_difference/mean": 0.01906203106045723, + "step": 321 + }, + { + "clip_ratio/high_max": 1.1029473625967512e-05, + "clip_ratio/high_mean": 2.757368406491878e-06, + "clip_ratio/low_mean": 5.367386921761863e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6431237737797346e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 
6942.2578125, + "completions/mean_terminated_length": 6477.90966796875, + "completions/min_length": 1156.0, + "completions/min_terminated_length": 1156.0, + "entropy": 0.8147861957550049, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027678858023136854, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 280370207.0, + "reward": 0.4375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998471736907959, + "sampling/importance_sampling_ratio/min": 0.00023058800434228033, + "sampling/sampling_logp_difference/max": 8.3748779296875, + "sampling/sampling_logp_difference/mean": 0.01940828748047352, + "step": 322 + }, + { + "clip_ratio/high_max": 2.6367894406575942e-05, + "clip_ratio/high_mean": 8.765707434577052e-06, + "clip_ratio/low_mean": 3.232976985145797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.109547796815605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6242.53125, + "completions/mean_terminated_length": 5915.38671875, + "completions/min_length": 1220.0, + "completions/min_terminated_length": 1220.0, + "entropy": 0.878915011882782, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00577945914119482, + "learning_rate": 1e-05, + "loss": 0.0839, + "num_tokens": 281189491.0, + "reward": 0.515625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 9.611724817659706e-05, + "sampling/sampling_logp_difference/max": 9.2499418258667, + 
"sampling/sampling_logp_difference/mean": 0.01948760263621807, + "step": 323 + }, + { + "clip_ratio/high_max": 3.50839609382092e-05, + "clip_ratio/high_mean": 1.1664920634757436e-05, + "clip_ratio/low_mean": 1.833109013205103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9996010880495305e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 7004.015625, + "completions/mean_terminated_length": 6622.71533203125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.7964659407734871, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014128695474937558, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 282103997.0, + "reward": 0.4140625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.0024504722096025944, + "sampling/sampling_logp_difference/max": 6.011474609375, + "sampling/sampling_logp_difference/mean": 0.019019678235054016, + "step": 324 + }, + { + "clip_ratio/high_max": 1.832260545597819e-05, + "clip_ratio/high_mean": 4.580651363994548e-06, + "clip_ratio/low_mean": 5.309064226821647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.767129368905444e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7822.6953125, + "completions/mean_terminated_length": 7546.52392578125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.8571138679981232, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002476039342582226, + "learning_rate": 
1e-05, + "loss": 0.0515, + "num_tokens": 283122382.0, + "reward": 0.4609375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.0009774373611435294, + "sampling/sampling_logp_difference/max": 6.930576324462891, + "sampling/sampling_logp_difference/mean": 0.020557202398777008, + "step": 325 + }, + { + "clip_ratio/high_max": 5.738419986300869e-06, + "clip_ratio/high_mean": 1.4346049965752172e-06, + "clip_ratio/low_mean": 4.19679121819172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3402517292179255e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7738.8984375, + "completions/mean_terminated_length": 6844.57763671875, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "entropy": 0.7839021533727646, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005309853237122297, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 284130081.0, + "reward": 0.5234375, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998971223831177, + "sampling/importance_sampling_ratio/min": 0.0001319014554610476, + "sampling/sampling_logp_difference/max": 8.933455467224121, + "sampling/sampling_logp_difference/mean": 0.01873316988348961, + "step": 326 + }, + { + "clip_ratio/high_max": 1.007085802484653e-05, + "clip_ratio/high_mean": 2.5177145062116324e-06, + "clip_ratio/low_mean": 4.043528815600439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.295300277590286e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15952.0, + "completions/mean_length": 7102.2421875, + "completions/mean_terminated_length": 6954.9130859375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8530801385641098, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228116944432259, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 285058720.0, + "reward": 0.5078125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00012956927821505815, + "sampling/sampling_logp_difference/max": 8.951294898986816, + "sampling/sampling_logp_difference/mean": 0.019325006753206253, + "step": 327 + }, + { + "clip_ratio/high_max": 4.06874551117653e-06, + "clip_ratio/high_mean": 1.0171863777941326e-06, + "clip_ratio/low_mean": 3.661125703047219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.762844340826632e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15594.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 6583.4765625, + "completions/mean_terminated_length": 6583.4765625, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.021921381354332, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004967439454048872, + "learning_rate": 1e-05, + "loss": 0.0374, + "num_tokens": 285919765.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, 
+ "sampling/importance_sampling_ratio/min": 0.016675354912877083, + "sampling/sampling_logp_difference/max": 4.093823432922363, + "sampling/sampling_logp_difference/mean": 0.021393200382590294, + "step": 328 + }, + { + "clip_ratio/high_max": 1.2215251445013564e-05, + "clip_ratio/high_mean": 3.053812861253391e-06, + "clip_ratio/low_mean": 4.05305947879242e-05, + "clip_ratio/low_min": 4.215567059873138e-06, + "clip_ratio/region_mean": 4.358440742180392e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16299.0, + "completions/mean_length": 7770.5859375, + "completions/mean_terminated_length": 7346.97509765625, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "entropy": 1.0466903448104858, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004189736675471067, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 286935512.0, + "reward": 0.3828125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.011683559976518154, + "sampling/sampling_logp_difference/max": 4.449572563171387, + "sampling/sampling_logp_difference/mean": 0.021805983036756516, + "step": 329 + }, + { + "clip_ratio/high_max": 2.0567378214764176e-05, + "clip_ratio/high_mean": 5.141844553691044e-06, + "clip_ratio/low_mean": 1.8177100628236076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3318944840866607e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15758.0, + "completions/mean_length": 5689.2421875, + "completions/mean_terminated_length": 5432.568359375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 
1194.0, + "entropy": 0.7778806164860725, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032866497058421373, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 287681943.0, + "reward": 0.640625, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940812587738, + "sampling/importance_sampling_ratio/min": 0.00038077132194302976, + "sampling/sampling_logp_difference/max": 7.873311519622803, + "sampling/sampling_logp_difference/mean": 0.01789461076259613, + "step": 330 + }, + { + "clip_ratio/high_max": 3.109086901531555e-05, + "clip_ratio/high_mean": 7.772717253828887e-06, + "clip_ratio/low_mean": 3.1423560130861006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919627738468989e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13820.0, + "completions/mean_length": 6288.1875, + "completions/mean_terminated_length": 6127.93701171875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.7709921672940254, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023572889622300863, + "learning_rate": 1e-05, + "loss": 0.0746, + "num_tokens": 288506735.0, + "reward": 0.484375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 0.000430915504693985, + "sampling/sampling_logp_difference/max": 7.749598503112793, + "sampling/sampling_logp_difference/mean": 0.017407266423106194, + "step": 331 + }, + { + "clip_ratio/high_max": 3.4638953366084024e-05, + 
"clip_ratio/high_mean": 9.51674803673086e-06, + "clip_ratio/low_mean": 6.26047980176736e-05, + "clip_ratio/low_min": 5.51267930859467e-06, + "clip_ratio/region_mean": 7.212154741864651e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 6775.0234375, + "completions/mean_terminated_length": 6465.05615234375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9338318258523941, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034220058005303144, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 289395498.0, + "reward": 0.390625, + "reward_std": 0.34533774852752686, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.0317598432302475, + "sampling/sampling_logp_difference/max": 3.449552536010742, + "sampling/sampling_logp_difference/mean": 0.019930530339479446, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.159989991123439e-05, + "clip_ratio/low_min": 1.5592839645250933e-05, + "clip_ratio/region_mean": 7.159989991123439e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 7142.9375, + "completions/mean_terminated_length": 6844.83837890625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.971405878663063, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002513247774913907, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 290329082.0, + "reward": 0.328125, + "reward_std": 0.28930896520614624, + 
"rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 3.152207455059397e-07, + "sampling/sampling_logp_difference/max": 14.969992637634277, + "sampling/sampling_logp_difference/mean": 0.022366533055901527, + "step": 333 + }, + { + "clip_ratio/high_max": 1.6507752206962323e-05, + "clip_ratio/high_mean": 4.126938051740581e-06, + "clip_ratio/low_mean": 1.7493430505055585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1620368215735652e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15581.0, + "completions/mean_length": 6412.2109375, + "completions/mean_terminated_length": 6333.69287109375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "entropy": 0.9136044681072235, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0056767817586660385, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 291170133.0, + "reward": 0.421875, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999720454216003, + "sampling/importance_sampling_ratio/min": 0.000458698661532253, + "sampling/sampling_logp_difference/max": 7.687117099761963, + "sampling/sampling_logp_difference/mean": 0.020012658089399338, + "step": 334 + }, + { + "clip_ratio/high_max": 8.26085442895419e-06, + "clip_ratio/high_mean": 2.0652136072385474e-06, + "clip_ratio/low_mean": 3.6938338666914206e-05, + "clip_ratio/low_min": 5.699044777429663e-06, + "clip_ratio/region_mean": 3.900355193309224e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16111.0, + "completions/mean_length": 8066.1015625, + "completions/mean_terminated_length": 7797.7822265625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 1.0789504647254944, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00243841833434999, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 292222082.0, + "reward": 0.3046875, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999664425849915, + "sampling/importance_sampling_ratio/min": 8.481895929435268e-05, + "sampling/sampling_logp_difference/max": 9.374991416931152, + "sampling/sampling_logp_difference/mean": 0.023650091141462326, + "step": 335 + }, + { + "clip_ratio/high_max": 5.320054697222076e-06, + "clip_ratio/high_mean": 1.330013674305519e-06, + "clip_ratio/low_mean": 1.9117383317279746e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0447396991585265e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15176.0, + "completions/mean_length": 6836.046875, + "completions/mean_terminated_length": 6606.896484375, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 1.218759760260582, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020856577903032303, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 293115984.0, + "reward": 0.21875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 
2.784526441246271e-05, + "sampling/sampling_logp_difference/max": 10.488847732543945, + "sampling/sampling_logp_difference/mean": 0.022012067958712578, + "step": 336 + }, + { + "clip_ratio/high_max": 2.5695502699818462e-05, + "clip_ratio/high_mean": 7.549717793153832e-06, + "clip_ratio/low_mean": 4.6741323160404136e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.429104089671455e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7501.9921875, + "completions/mean_terminated_length": 7140.9345703125, + "completions/min_length": 1237.0, + "completions/min_terminated_length": 1237.0, + "entropy": 0.8940394818782806, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005163854919373989, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 294099503.0, + "reward": 0.328125, + "reward_std": 0.30904707312583923, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999276399612427, + "sampling/importance_sampling_ratio/min": 0.0006545600481331348, + "sampling/sampling_logp_difference/max": 7.331547260284424, + "sampling/sampling_logp_difference/mean": 0.020813245326280594, + "step": 337 + }, + { + "clip_ratio/high_max": 3.1606674838258186e-05, + "clip_ratio/high_mean": 9.45794374729303e-06, + "clip_ratio/low_mean": 4.5567895540443715e-05, + "clip_ratio/low_min": 4.458871444512624e-06, + "clip_ratio/region_mean": 5.502583962879726e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7204.828125, + "completions/mean_terminated_length": 6908.7255859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.9961872175335884, + 
"epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029277894645929337, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 295042105.0, + "reward": 0.390625, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000677108764648, + "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05, + "sampling/sampling_logp_difference/max": 10.872637748718262, + "sampling/sampling_logp_difference/mean": 0.020187582820653915, + "step": 338 + }, + { + "clip_ratio/high_max": 1.7963964182854397e-05, + "clip_ratio/high_mean": 5.194059781388205e-06, + "clip_ratio/low_mean": 1.8380221035840805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.357428081722901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15856.0, + "completions/mean_length": 6256.859375, + "completions/mean_terminated_length": 6013.80810546875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "entropy": 0.9293600022792816, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032952844630926847, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 295867039.0, + "reward": 0.46875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999649524688721, + "sampling/importance_sampling_ratio/min": 7.995560008566827e-05, + "sampling/sampling_logp_difference/max": 9.434039115905762, + "sampling/sampling_logp_difference/mean": 0.019491540268063545, + "step": 339 + }, + { + "clip_ratio/high_max": 7.577551059512189e-06, + "clip_ratio/high_mean": 1.8943877648780472e-06, + 
"clip_ratio/low_mean": 2.7479814093567256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9374201631071628e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15412.0, + "completions/mean_length": 7397.84375, + "completions/mean_terminated_length": 7032.552734375, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.8508890569210052, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029417150653898716, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 296832843.0, + "reward": 0.375, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000183582305908, + "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05, + "sampling/sampling_logp_difference/max": 10.93724250793457, + "sampling/sampling_logp_difference/mean": 0.01975393109023571, + "step": 340 + }, + { + "clip_ratio/high_max": 3.281225508544594e-05, + "clip_ratio/high_mean": 1.3302957199812226e-05, + "clip_ratio/low_mean": 5.109179869577929e-05, + "clip_ratio/low_min": 6.657612175331451e-06, + "clip_ratio/region_mean": 6.439475532715733e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6897.765625, + "completions/mean_terminated_length": 6823.07080078125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9046694040298462, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026788609102368355, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 297735285.0, + "reward": 0.421875, + "reward_std": 0.3266732692718506, + "rewards/accuracy_reward/mean": 0.421875, + 
"rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.001710799871943891, + "sampling/sampling_logp_difference/max": 6.370794296264648, + "sampling/sampling_logp_difference/mean": 0.020578179508447647, + "step": 341 + }, + { + "clip_ratio/high_max": 1.7319889593636617e-05, + "clip_ratio/high_mean": 5.168538336874917e-06, + "clip_ratio/low_mean": 7.019768918326008e-05, + "clip_ratio/low_min": 2.541147478041239e-05, + "clip_ratio/region_mean": 7.53662266106403e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 6971.9921875, + "completions/mean_terminated_length": 6509.10595703125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8658201694488525, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005915141198784113, + "learning_rate": 1e-05, + "loss": 0.0923, + "num_tokens": 298645124.0, + "reward": 0.3984375, + "reward_std": 0.3742823898792267, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999268651008606, + "sampling/importance_sampling_ratio/min": 0.000970841443631798, + "sampling/sampling_logp_difference/max": 6.937347412109375, + "sampling/sampling_logp_difference/mean": 0.01906151883304119, + "step": 342 + }, + { + "clip_ratio/high_max": 1.8332865238335216e-05, + "clip_ratio/high_mean": 4.583216309583804e-06, + "clip_ratio/low_mean": 6.167940273371642e-05, + "clip_ratio/low_min": 5.969151516183047e-06, + "clip_ratio/region_mean": 6.626261847486603e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15054.0, + 
"completions/mean_length": 6545.6953125, + "completions/mean_terminated_length": 5889.80859375, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.779609851539135, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032792428974062204, + "learning_rate": 1e-05, + "loss": 0.097, + "num_tokens": 299503781.0, + "reward": 0.609375, + "reward_std": 0.38293448090553284, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999361634254456, + "sampling/importance_sampling_ratio/min": 0.002187495119869709, + "sampling/sampling_logp_difference/max": 6.124998092651367, + "sampling/sampling_logp_difference/mean": 0.017413027584552765, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.46246323235755e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.46246323235755e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7226.515625, + "completions/mean_terminated_length": 7006.736328125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9573849961161613, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005092279519885778, + "learning_rate": 1e-05, + "loss": 0.1102, + "num_tokens": 300447903.0, + "reward": 0.5390625, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999373555183411, + "sampling/importance_sampling_ratio/min": 0.000627054600045085, + "sampling/sampling_logp_difference/max": 7.374476909637451, + 
"sampling/sampling_logp_difference/mean": 0.021570835262537003, + "step": 344 + }, + { + "clip_ratio/high_max": 5.487269390869187e-06, + "clip_ratio/high_mean": 1.3718173477172968e-06, + "clip_ratio/low_mean": 4.7280102080549113e-05, + "clip_ratio/low_min": 1.0166083029616857e-05, + "clip_ratio/region_mean": 4.865191931457957e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14967.0, + "completions/mean_length": 5755.171875, + "completions/mean_terminated_length": 5323.10546875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8482184633612633, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005033228080719709, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 301206021.0, + "reward": 0.390625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.0014573346124961972, + "sampling/sampling_logp_difference/max": 6.531146049499512, + "sampling/sampling_logp_difference/mean": 0.018870476633310318, + "step": 345 + }, + { + "clip_ratio/high_max": 5.421346941147931e-06, + "clip_ratio/high_mean": 1.3553367352869827e-06, + "clip_ratio/low_mean": 1.6510994441887306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.786633117717429e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 7098.7265625, + "completions/mean_terminated_length": 6875.88037109375, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "entropy": 0.87320177257061, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007659573573619127, + 
"learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 302133890.0, + "reward": 0.421875, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0012466582702472806, + "sampling/sampling_logp_difference/max": 6.687288761138916, + "sampling/sampling_logp_difference/mean": 0.019994346424937248, + "step": 346 + }, + { + "clip_ratio/high_max": 1.1556229310372146e-05, + "clip_ratio/high_mean": 2.8890573275930365e-06, + "clip_ratio/low_mean": 3.8744643916288624e-05, + "clip_ratio/low_min": 6.108287834649673e-06, + "clip_ratio/region_mean": 4.1633702039689524e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16139.0, + "completions/mean_length": 6399.96875, + "completions/mean_terminated_length": 6077.90283203125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9481896534562111, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014135175151750445, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 302972566.0, + "reward": 0.4140625, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0025698256213217974, + "sampling/sampling_logp_difference/max": 5.963917255401611, + "sampling/sampling_logp_difference/mean": 0.02073008380830288, + "step": 347 + }, + { + "clip_ratio/high_max": 6.59491388432798e-06, + "clip_ratio/high_mean": 2.545892130001448e-06, + "clip_ratio/low_mean": 4.620846755187813e-05, + "clip_ratio/low_min": 
6.243132702365983e-06, + "clip_ratio/region_mean": 4.875435956819274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 7298.078125, + "completions/mean_terminated_length": 7226.53564453125, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "entropy": 0.8719206526875496, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027898226398974657, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 303925976.0, + "reward": 0.484375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.005236432887613773, + "sampling/sampling_logp_difference/max": 5.252114772796631, + "sampling/sampling_logp_difference/mean": 0.020944103598594666, + "step": 348 + }, + { + "clip_ratio/high_max": 1.052124343914329e-05, + "clip_ratio/high_mean": 2.6303108597858227e-06, + "clip_ratio/low_mean": 2.010384196182713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.273415248055244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14980.0, + "completions/mean_length": 5667.0390625, + "completions/mean_terminated_length": 5496.9287109375, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "entropy": 0.8791451379656792, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012764945859089494, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 304675157.0, + "reward": 0.390625, + "reward_std": 0.17965976893901825, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000383853912354, + "sampling/importance_sampling_ratio/min": 5.054428584116977e-06, + "sampling/sampling_logp_difference/max": 12.195245742797852, + "sampling/sampling_logp_difference/mean": 0.018928447738289833, + "step": 349 + }, + { + "clip_ratio/high_max": 9.578045592206763e-06, + "clip_ratio/high_mean": 2.3945113980516908e-06, + "clip_ratio/low_mean": 3.1114799753595435e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350931149270764e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15354.0, + "completions/max_terminated_length": 15354.0, + "completions/mean_length": 5874.4453125, + "completions/mean_terminated_length": 5874.4453125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9577538818120956, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00509974779561162, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 305447038.0, + "reward": 0.515625, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999423027038574, + "sampling/importance_sampling_ratio/min": 0.004791648127138615, + "sampling/sampling_logp_difference/max": 5.340880870819092, + "sampling/sampling_logp_difference/mean": 0.02114470861852169, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0903062275247066e-05, + "clip_ratio/high_mean": 2.7257655688117666e-06, + "clip_ratio/low_mean": 4.784364205079328e-05, + "clip_ratio/low_min": 3.861600362142781e-06, + "clip_ratio/region_mean": 5.056940744907479e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 6197.5703125, + 
"completions/mean_terminated_length": 6035.88134765625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.8665244281291962, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030849494505673647, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 306258023.0, + "reward": 0.515625, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998056888580322, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.021017421036958694, + "step": 351 + }, + { + "clip_ratio/high_max": 1.4299712574938894e-05, + "clip_ratio/high_mean": 4.3520980170796975e-06, + "clip_ratio/low_mean": 6.213493452378316e-05, + "clip_ratio/low_min": 1.0056635801447555e-05, + "clip_ratio/region_mean": 6.648703174505499e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7522.578125, + "completions/mean_terminated_length": 7381.9208984375, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.8185881152749062, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002946985885500908, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 307240305.0, + "reward": 0.3125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.005127199459820986, + "sampling/sampling_logp_difference/max": 5.273195743560791, + 
"sampling/sampling_logp_difference/mean": 0.01965932548046112, + "step": 352 + }, + { + "clip_ratio/high_max": 1.693051035545068e-05, + "clip_ratio/high_mean": 5.08456730585749e-06, + "clip_ratio/low_mean": 4.2052345861520735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.713691282631771e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14090.0, + "completions/mean_length": 6403.2265625, + "completions/mean_terminated_length": 6163.6884765625, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "entropy": 0.8359840363264084, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031181599479168653, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 308079318.0, + "reward": 0.5, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999215602874756, + "sampling/importance_sampling_ratio/min": 6.73715621815063e-05, + "sampling/sampling_logp_difference/max": 9.605287551879883, + "sampling/sampling_logp_difference/mean": 0.01963040418922901, + "step": 353 + }, + { + "clip_ratio/high_max": 1.3988919135954347e-05, + "clip_ratio/high_mean": 3.497229783988587e-06, + "clip_ratio/low_mean": 6.722658486069122e-05, + "clip_ratio/low_min": 1.858519090092159e-05, + "clip_ratio/region_mean": 7.072381458783639e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7954.03125, + "completions/mean_terminated_length": 7751.71240234375, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.905990719795227, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002656223252415657, + 
"learning_rate": 1e-05, + "loss": 0.1022, + "num_tokens": 309117770.0, + "reward": 0.3828125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999536275863647, + "sampling/importance_sampling_ratio/min": 0.0003354826185386628, + "sampling/sampling_logp_difference/max": 7.999940395355225, + "sampling/sampling_logp_difference/mean": 0.020741507411003113, + "step": 354 + }, + { + "clip_ratio/high_max": 1.7610595023143105e-05, + "clip_ratio/high_mean": 4.402648755785776e-06, + "clip_ratio/low_mean": 4.337988764291367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.778253651238629e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6630.09375, + "completions/mean_terminated_length": 6315.45166015625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.870736837387085, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0060529084876179695, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 309988894.0, + "reward": 0.515625, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998822212219238, + "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05, + "sampling/sampling_logp_difference/max": 10.716434478759766, + "sampling/sampling_logp_difference/mean": 0.02060208097100258, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0448093235027045e-05, + "clip_ratio/high_mean": 2.6120233087567613e-06, + "clip_ratio/low_mean": 3.1030769946482906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.364279325523967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15920.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6679.6171875, + "completions/mean_terminated_length": 6679.6171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9812518879771233, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00400698184967041, + "learning_rate": 1e-05, + "loss": 0.0605, + "num_tokens": 310864013.0, + "reward": 0.421875, + "reward_std": 0.3295465111732483, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999049305915833, + "sampling/importance_sampling_ratio/min": 0.0020593837834894657, + "sampling/sampling_logp_difference/max": 6.1853485107421875, + "sampling/sampling_logp_difference/mean": 0.02098071575164795, + "step": 356 + }, + { + "clip_ratio/high_max": 2.124982574969181e-05, + "clip_ratio/high_mean": 7.736592579021817e-06, + "clip_ratio/low_mean": 2.900951585615985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.674610888992902e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14541.0, + "completions/mean_length": 5523.796875, + "completions/mean_terminated_length": 5173.4677734375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9120645374059677, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005929585546255112, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 311589987.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998446702957153, + "sampling/importance_sampling_ratio/min": 0.0010661041596904397, + "sampling/sampling_logp_difference/max": 6.843744277954102, + "sampling/sampling_logp_difference/mean": 0.019948206841945648, + "step": 357 + }, + { + "clip_ratio/high_max": 2.4486997745043482e-05, + "clip_ratio/high_mean": 8.219769085826556e-06, + "clip_ratio/low_mean": 5.346400575945154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.168377467474784e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15401.0, + "completions/mean_length": 6361.3671875, + "completions/mean_terminated_length": 6282.44873046875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.8044678047299385, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006622390355914831, + "learning_rate": 1e-05, + "loss": 0.1023, + "num_tokens": 312424034.0, + "reward": 0.5078125, + "reward_std": 0.3724474310874939, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.0003157092141918838, + "sampling/sampling_logp_difference/max": 8.060688972473145, + "sampling/sampling_logp_difference/mean": 0.018907658755779266, + "step": 358 + }, + { + "clip_ratio/high_max": 1.0407376748844399e-05, + "clip_ratio/high_mean": 2.6018441872110998e-06, + "clip_ratio/low_mean": 5.925514369664597e-05, + "clip_ratio/low_min": 1.3324347946763737e-05, + "clip_ratio/region_mean": 6.185698703120579e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 7109.0, + "completions/mean_terminated_length": 7035.96826171875, + 
"completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9167275875806808, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004639944992959499, + "learning_rate": 1e-05, + "loss": 0.0861, + "num_tokens": 313353346.0, + "reward": 0.4140625, + "reward_std": 0.3826971650123596, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999389052391052, + "sampling/importance_sampling_ratio/min": 0.0019070414127781987, + "sampling/sampling_logp_difference/max": 6.262202262878418, + "sampling/sampling_logp_difference/mean": 0.02155841514468193, + "step": 359 + }, + { + "clip_ratio/high_max": 3.959046694035351e-05, + "clip_ratio/high_mean": 1.0912523691786191e-05, + "clip_ratio/low_mean": 3.3944450819944905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.485697365907981e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6314.2734375, + "completions/mean_terminated_length": 6072.60009765625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.8780038207769394, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007643720600754023, + "learning_rate": 1e-05, + "loss": 0.0873, + "num_tokens": 314180717.0, + "reward": 0.4609375, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999802112579346, + "sampling/importance_sampling_ratio/min": 0.021285315975546837, + "sampling/sampling_logp_difference/max": 3.8497378826141357, + "sampling/sampling_logp_difference/mean": 0.01964358240365982, + "step": 360 
+ }, + { + "clip_ratio/high_max": 3.065382111344661e-05, + "clip_ratio/high_mean": 9.187473835936544e-06, + "clip_ratio/low_mean": 4.137891801292426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.056639065514901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6718.2265625, + "completions/mean_terminated_length": 6486.24853515625, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8326799497008324, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050973957404494286, + "learning_rate": 1e-05, + "loss": 0.0109, + "num_tokens": 315060842.0, + "reward": 0.5078125, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.0009130688849836588, + "sampling/sampling_logp_difference/max": 6.998699188232422, + "sampling/sampling_logp_difference/mean": 0.019501537084579468, + "step": 361 + }, + { + "clip_ratio/high_max": 8.624853762739804e-06, + "clip_ratio/high_mean": 2.156213440684951e-06, + "clip_ratio/low_mean": 1.8797969062234188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0954182048171788e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 8666.8359375, + "completions/mean_terminated_length": 7941.291015625, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.9526705741882324, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019092690199613571, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 316190325.0, + "reward": 
0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999814629554749, + "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05, + "sampling/sampling_logp_difference/max": 10.249995231628418, + "sampling/sampling_logp_difference/mean": 0.02051631174981594, + "step": 362 + }, + { + "clip_ratio/high_max": 2.147400391550036e-05, + "clip_ratio/high_mean": 6.434908300434472e-06, + "clip_ratio/low_mean": 3.521234066283796e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.164724816746457e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15164.0, + "completions/mean_length": 7661.8203125, + "completions/mean_terminated_length": 7002.16015625, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.8322782590985298, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019530428107827902, + "learning_rate": 1e-05, + "loss": 0.0729, + "num_tokens": 317191878.0, + "reward": 0.4609375, + "reward_std": 0.21382391452789307, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 8.546619210392237e-05, + "sampling/sampling_logp_difference/max": 9.367389678955078, + "sampling/sampling_logp_difference/mean": 0.019894573837518692, + "step": 363 + }, + { + "clip_ratio/high_max": 1.9436202364886412e-05, + "clip_ratio/high_mean": 6.089704697842535e-06, + "clip_ratio/low_mean": 4.2698405422925134e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.878810955233348e-05, + "completions/clipped_ratio": 0.0234375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 7024.859375, + "completions/mean_terminated_length": 6800.240234375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.794853538274765, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031784537713974714, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 318109004.0, + "reward": 0.4921875, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352693557739, + "sampling/importance_sampling_ratio/min": 0.0002962362195830792, + "sampling/sampling_logp_difference/max": 8.124353408813477, + "sampling/sampling_logp_difference/mean": 0.018519200384616852, + "step": 364 + }, + { + "clip_ratio/high_max": 4.127455667912727e-06, + "clip_ratio/high_mean": 1.0318639169781818e-06, + "clip_ratio/low_mean": 4.342453667049995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.445640047379129e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 7282.1796875, + "completions/mean_terminated_length": 6912.1865234375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.904067650437355, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005080109462141991, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 319059075.0, + "reward": 0.4140625, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062108039856, + 
"sampling/importance_sampling_ratio/min": 0.1194523349404335, + "sampling/sampling_logp_difference/max": 6.136754989624023, + "sampling/sampling_logp_difference/mean": 0.019978653639554977, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.608940076243016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.608940076243016e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15625.0, + "completions/mean_length": 7131.5234375, + "completions/mean_terminated_length": 6596.255859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.8849587142467499, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022667953744530678, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 319990046.0, + "reward": 0.46875, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0370909757912159, + "sampling/sampling_logp_difference/max": 3.294381618499756, + "sampling/sampling_logp_difference/mean": 0.02037571743130684, + "step": 366 + }, + { + "clip_ratio/high_max": 1.5356635913121863e-05, + "clip_ratio/high_mean": 3.839158978280466e-06, + "clip_ratio/low_mean": 3.4950805911648786e-05, + "clip_ratio/low_min": 4.876336333836662e-06, + "clip_ratio/region_mean": 3.8789965287833184e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 6655.4453125, + "completions/mean_terminated_length": 6578.84228515625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.7417122721672058, + "epoch": 
0.3376264949402024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00216497085057199, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 320860135.0, + "reward": 0.5625, + "reward_std": 0.3369230031967163, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0005190494703128934, + "sampling/sampling_logp_difference/max": 7.563511371612549, + "sampling/sampling_logp_difference/mean": 0.01771342009305954, + "step": 367 + }, + { + "clip_ratio/high_max": 1.7605634639039636e-05, + "clip_ratio/high_mean": 5.297029474604642e-06, + "clip_ratio/low_mean": 5.688933060810086e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.218636053745286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15849.0, + "completions/mean_length": 7077.1640625, + "completions/mean_terminated_length": 6619.45068359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.8749325424432755, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0028338562697172165, + "learning_rate": 1e-05, + "loss": 0.0643, + "num_tokens": 321783852.0, + "reward": 0.3828125, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998220205307007, + "sampling/importance_sampling_ratio/min": 7.83290306571871e-06, + "sampling/sampling_logp_difference/max": 11.757177352905273, + "sampling/sampling_logp_difference/mean": 0.020299233496189117, + "step": 368 + }, + { + "clip_ratio/high_max": 7.301828190975357e-06, + "clip_ratio/high_mean": 1.8254570477438392e-06, + "clip_ratio/low_mean": 
5.158197632226802e-05, + "clip_ratio/low_min": 3.735804057214409e-06, + "clip_ratio/region_mean": 5.340743223314348e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6034.296875, + "completions/mean_terminated_length": 5525.294921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.80014718323946, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022897711023688316, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 322572882.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999347925186157, + "sampling/importance_sampling_ratio/min": 0.0004105660773348063, + "sampling/sampling_logp_difference/max": 7.7979736328125, + "sampling/sampling_logp_difference/mean": 0.01858348958194256, + "step": 369 + }, + { + "clip_ratio/high_max": 9.364057859784225e-06, + "clip_ratio/high_mean": 3.351393047523743e-06, + "clip_ratio/low_mean": 4.186752630630508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5218919240141986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 8172.109375, + "completions/mean_terminated_length": 7838.29248046875, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "entropy": 0.8732693120837212, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003263789461925626, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 323640904.0, + "reward": 0.2890625, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999354481697083, + "sampling/importance_sampling_ratio/min": 9.27252222027164e-06, + "sampling/sampling_logp_difference/max": 11.588455200195312, + "sampling/sampling_logp_difference/mean": 0.0208889190107584, + "step": 370 + }, + { + "clip_ratio/high_max": 2.0998899799451465e-05, + "clip_ratio/high_mean": 6.692962131182867e-06, + "clip_ratio/low_mean": 4.261424010110204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930720297124935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 7699.203125, + "completions/mean_terminated_length": 7419.04833984375, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.8296505436301231, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0042716520838439465, + "learning_rate": 1e-05, + "loss": 0.0937, + "num_tokens": 324643858.0, + "reward": 0.4921875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874234199524, + "sampling/importance_sampling_ratio/min": 0.00022192654432728887, + "sampling/sampling_logp_difference/max": 8.413164138793945, + "sampling/sampling_logp_difference/mean": 0.018926654011011124, + "step": 371 + }, + { + "clip_ratio/high_max": 7.061349151626928e-06, + "clip_ratio/high_mean": 1.765337287906732e-06, + "clip_ratio/low_mean": 4.5005243464402156e-05, + "clip_ratio/low_min": 3.861838649754645e-06, + "clip_ratio/region_mean": 4.6770580411248375e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 7450.1640625, + 
"completions/mean_terminated_length": 7450.1640625, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 1.0400195196270943, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033558050636202097, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 325617687.0, + "reward": 0.2578125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999459385871887, + "sampling/importance_sampling_ratio/min": 0.039920732378959656, + "sampling/sampling_logp_difference/max": 3.2208595275878906, + "sampling/sampling_logp_difference/mean": 0.02249298244714737, + "step": 372 + }, + { + "clip_ratio/high_max": 1.3147802746971138e-05, + "clip_ratio/high_mean": 3.2869506867427845e-06, + "clip_ratio/low_mean": 2.4451034505545977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7737984851228248e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15342.0, + "completions/mean_length": 6799.0703125, + "completions/mean_terminated_length": 6723.5986328125, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9737623482942581, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005797459278255701, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 326508384.0, + "reward": 0.3125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321699142456, + "sampling/importance_sampling_ratio/min": 7.535634836131067e-07, + "sampling/sampling_logp_difference/max": 14.0984525680542, + 
"sampling/sampling_logp_difference/mean": 0.021543748676776886, + "step": 373 + }, + { + "clip_ratio/high_max": 3.3594023989280686e-06, + "clip_ratio/high_mean": 8.398505997320171e-07, + "clip_ratio/low_mean": 2.3457610382138228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4297460981870245e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 7034.3671875, + "completions/mean_terminated_length": 6654.30078125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8749603256583214, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002258980879560113, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 327426407.0, + "reward": 0.4609375, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.008719252422451973, + "sampling/sampling_logp_difference/max": 4.742221832275391, + "sampling/sampling_logp_difference/mean": 0.01997346058487892, + "step": 374 + }, + { + "clip_ratio/high_max": 2.823375348270929e-05, + "clip_ratio/high_mean": 7.058438370677322e-06, + "clip_ratio/low_mean": 4.9395109726901865e-05, + "clip_ratio/low_min": 1.636556044104509e-05, + "clip_ratio/region_mean": 5.6453548268109444e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15240.0, + "completions/mean_length": 6623.078125, + "completions/mean_terminated_length": 6388.81640625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.858784057199955, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.002420129720121622, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 328292985.0, + "reward": 0.4140625, + "reward_std": 0.3077537417411804, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 0.00014900295354891568, + "sampling/sampling_logp_difference/max": 8.811544418334961, + "sampling/sampling_logp_difference/mean": 0.019645996391773224, + "step": 375 + }, + { + "clip_ratio/high_max": 1.8078507309837732e-05, + "clip_ratio/high_mean": 6.468551191574079e-06, + "clip_ratio/low_mean": 4.051302585139638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.698157727034413e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15229.0, + "completions/mean_length": 5902.4765625, + "completions/mean_terminated_length": 5564.36279296875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.904740035533905, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004107976797968149, + "learning_rate": 1e-05, + "loss": 0.0824, + "num_tokens": 329067006.0, + "reward": 0.5546875, + "reward_std": 0.3945493996143341, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05, + "sampling/sampling_logp_difference/max": 11.37439250946045, + "sampling/sampling_logp_difference/mean": 0.019582755863666534, + "step": 376 + }, + { + "clip_ratio/high_max": 2.553658168835682e-05, + "clip_ratio/high_mean": 7.276365181496658e-06, + "clip_ratio/low_mean": 1.7552573126522475e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.482893796695862e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6425.6015625, + "completions/mean_terminated_length": 6267.5322265625, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.964553713798523, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003208522219210863, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 329910691.0, + "reward": 0.359375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999419450759888, + "sampling/importance_sampling_ratio/min": 0.00137569778598845, + "sampling/sampling_logp_difference/max": 6.588794231414795, + "sampling/sampling_logp_difference/mean": 0.021154657006263733, + "step": 377 + }, + { + "clip_ratio/high_max": 6.8712420215888415e-06, + "clip_ratio/high_mean": 1.7178105053972104e-06, + "clip_ratio/low_mean": 4.0991827404468495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2709637853022286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 8006.4453125, + "completions/mean_terminated_length": 7594.43408203125, + "completions/min_length": 1235.0, + "completions/min_terminated_length": 1235.0, + "entropy": 0.8980336412787437, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002898421371355653, + "learning_rate": 1e-05, + "loss": 0.0815, + "num_tokens": 330956332.0, + "reward": 0.4296875, + "reward_std": 0.20175684988498688, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 9.378339746035635e-05, + "sampling/sampling_logp_difference/max": 9.27452278137207, + "sampling/sampling_logp_difference/mean": 0.021021340042352676, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2689344689297286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2689344689297286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15484.0, + "completions/max_terminated_length": 15484.0, + "completions/mean_length": 7068.828125, + "completions/mean_terminated_length": 7068.828125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.9865007549524307, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0037063576746731997, + "learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 331880918.0, + "reward": 0.3203125, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0001819290773710236, + "sampling/sampling_logp_difference/max": 8.611893653869629, + "sampling/sampling_logp_difference/mean": 0.02072504535317421, + "step": 379 + }, + { + "clip_ratio/high_max": 5.845633268108941e-06, + "clip_ratio/high_mean": 1.4614083170272352e-06, + "clip_ratio/low_mean": 3.207486906831036e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353627721480734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 7379.390625, + "completions/mean_terminated_length": 7236.4609375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + 
"entropy": 0.8977236375212669, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001972826896235347, + "learning_rate": 1e-05, + "loss": 0.0228, + "num_tokens": 332849112.0, + "reward": 0.4140625, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 2.820451663865242e-05, + "sampling/sampling_logp_difference/max": 10.476028442382812, + "sampling/sampling_logp_difference/mean": 0.019411223009228706, + "step": 380 + }, + { + "clip_ratio/high_max": 4.875385002378607e-06, + "clip_ratio/high_mean": 1.2188462505946518e-06, + "clip_ratio/low_mean": 2.3530714997832547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.47495612484272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15517.0, + "completions/mean_length": 6867.9609375, + "completions/mean_terminated_length": 6793.03125, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "entropy": 0.9244343340396881, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.006926023401319981, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333746179.0, + "reward": 0.4140625, + "reward_std": 0.1433562934398651, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.0003875594411510974, + "sampling/sampling_logp_difference/max": 7.8556413650512695, + "sampling/sampling_logp_difference/mean": 0.020311862230300903, + "step": 381 + }, + { + "clip_ratio/high_max": 1.5651628245905158e-05, + 
"clip_ratio/high_mean": 4.836261211949022e-06, + "clip_ratio/low_mean": 5.268017821435933e-05, + "clip_ratio/low_min": 3.950945028918795e-06, + "clip_ratio/region_mean": 5.751643902840442e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 7525.375, + "completions/mean_terminated_length": 6855.3955078125, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9207312315702438, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0047226278111338615, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 334731027.0, + "reward": 0.3359375, + "reward_std": 0.3353874683380127, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999615550041199, + "sampling/importance_sampling_ratio/min": 0.00029753465787507594, + "sampling/sampling_logp_difference/max": 8.119979858398438, + "sampling/sampling_logp_difference/mean": 0.021496692672371864, + "step": 382 + }, + { + "clip_ratio/high_max": 3.815379886873416e-05, + "clip_ratio/high_mean": 9.53844971718354e-06, + "clip_ratio/low_mean": 4.519663821156428e-05, + "clip_ratio/low_min": 2.775434040813707e-06, + "clip_ratio/region_mean": 5.473508826980833e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16251.0, + "completions/mean_length": 6841.0625, + "completions/mean_terminated_length": 6453.13818359375, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.8979457840323448, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004971448332071304, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 335631243.0, + "reward": 0.390625, + "reward_std": 
0.2596156895160675, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999934196472168, + "sampling/importance_sampling_ratio/min": 9.655764188210014e-06, + "sampling/sampling_logp_difference/max": 11.547955513000488, + "sampling/sampling_logp_difference/mean": 0.020256079733371735, + "step": 383 + }, + { + "clip_ratio/high_max": 4.162365712545579e-06, + "clip_ratio/high_mean": 1.0405914281363948e-06, + "clip_ratio/low_mean": 3.1563491688757495e-05, + "clip_ratio/low_min": 3.1228139505401487e-06, + "clip_ratio/region_mean": 3.260408311689389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15060.0, + "completions/mean_length": 6919.8046875, + "completions/mean_terminated_length": 6454.35205078125, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9241961911320686, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038604787550866604, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 336537162.0, + "reward": 0.375, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998080730438232, + "sampling/importance_sampling_ratio/min": 0.0009118975722230971, + "sampling/sampling_logp_difference/max": 6.999982833862305, + "sampling/sampling_logp_difference/mean": 0.02030865103006363, + "step": 384 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 336537162, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + 
"should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-384/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-384/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-384/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/README.md b/dapo_milora_plus_20251201_131939/checkpoint-448/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-448/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + 
"k_proj", + "up_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-448/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else 
%}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/latest b/dapo_milora_plus_20251201_131939/checkpoint-448/latest new file mode 100644 index 0000000000000000000000000000000000000000..6c83691d1f18f1aa59c0994e76f1e0d010c88273 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/latest @@ -0,0 +1 @@ +global_step448 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-448/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/tokenizer_config.json 
b/dapo_milora_plus_20251201_131939/checkpoint-448/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + 
"tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-448/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3cfb0d659f3b7dfbfb24866f32ed103e61c28673 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/trainer_state.json @@ -0,0 +1,13922 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.41214351425942963, + "eval_steps": 500, + "global_step": 448, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + 
"clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + 
"completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + 
"sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, 
+ "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + 
"sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 
1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + 
"clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 
0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + 
"completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + 
"rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 
6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, 
+ "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, 
+ "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + 
"completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 
1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + 
"sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + 
"clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, 
+ "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 
8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 
0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 
0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 
3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + 
"completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, 
+ "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + 
"learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + 
"completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 
6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 
5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + 
"sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + 
"loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + 
"completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 
4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 
0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + 
"completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 
0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 
0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 
3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 
6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 
0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 
0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + 
"sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + 
"completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + 
"sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 
84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + 
"sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 
0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 
5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + 
"rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + 
"completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + 
"sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + 
"learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 
525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 
7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + 
"reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + 
"sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + 
"completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + 
"sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 
1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 
169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + 
"clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 
0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + 
"sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 
656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + 
"clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + 
"completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 
18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 
6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, 
+ "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 
140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + 
"completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 
8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + 
"reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + 
"sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + 
"entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 
3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + 
"completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + 
"sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 
0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 
2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + 
"completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + 
"sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + "sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 
819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + "completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + 
"clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + "sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + 
"reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + 
"sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 
3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + 
"completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + 
"clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + 
"reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + 
"sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + 
"entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 
3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + 
"rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + "reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + 
"completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 
10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 
0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 
63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 
4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 
0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + 
"sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 
6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + "completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + 
"step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 
217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + 
"completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + }, + { + "clip_ratio/high_max": 6.87833608026267e-06, + "clip_ratio/high_mean": 2.9462287329806713e-06, + "clip_ratio/low_mean": 5.435333650893881e-05, + "clip_ratio/low_min": 5.33937054569833e-06, + "clip_ratio/region_mean": 5.729956546929316e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 6448.0078125, + "completions/mean_terminated_length": 6369.771484375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9546648040413857, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004310046322643757, + "learning_rate": 1e-05, + "loss": 0.1082, + "num_tokens": 220304605.0, + "reward": 0.5703125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 0.0001234127557836473, + "sampling/sampling_logp_difference/max": 8.99997615814209, + "sampling/sampling_logp_difference/mean": 0.020253397524356842, + "step": 257 + }, + { + "clip_ratio/high_max": 6.196094091137638e-06, + "clip_ratio/high_mean": 1.5490235227844096e-06, + "clip_ratio/low_mean": 2.5416685957679874e-05, + "clip_ratio/low_min": 5.5736391004757024e-06, + "clip_ratio/region_mean": 2.696570959415112e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 7457.6484375, + "completions/mean_terminated_length": 6941.24755859375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "entropy": 0.8182889074087143, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026646999176591635, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 221281968.0, + "reward": 0.4453125, + "reward_std": 0.2012200653553009, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173283576965, + "sampling/importance_sampling_ratio/min": 2.902353571698768e-06, + "sampling/sampling_logp_difference/max": 12.749988555908203, + "sampling/sampling_logp_difference/mean": 0.019208962097764015, + "step": 258 + }, + { + "clip_ratio/high_max": 1.6189535017474554e-05, + "clip_ratio/high_mean": 4.047383754368639e-06, + "clip_ratio/low_mean": 3.127787306311802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.532525670379982e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 8561.109375, + "completions/mean_terminated_length": 7969.79052734375, + 
"completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.9581378549337387, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016026750672608614, + "learning_rate": 1e-05, + "loss": 0.0131, + "num_tokens": 222399046.0, + "reward": 0.34375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 1.653693971093162e-06, + "sampling/sampling_logp_difference/max": 13.312499046325684, + "sampling/sampling_logp_difference/mean": 0.02173236384987831, + "step": 259 + }, + { + "clip_ratio/high_max": 1.4200771602190798e-05, + "clip_ratio/high_mean": 4.3255887476334465e-06, + "clip_ratio/low_mean": 5.2955770115659107e-05, + "clip_ratio/low_min": 3.402656830076012e-06, + "clip_ratio/region_mean": 5.7281358749605715e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16239.0, + "completions/mean_length": 7152.34375, + "completions/mean_terminated_length": 7079.6533203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9052041247487068, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005460259038954973, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 223335010.0, + "reward": 0.4296875, + "reward_std": 0.3356297016143799, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966621398926, + "sampling/importance_sampling_ratio/min": 0.010161337442696095, + "sampling/sampling_logp_difference/max": 4.589165210723877, + "sampling/sampling_logp_difference/mean": 0.01986619457602501, 
+ "step": 260 + }, + { + "clip_ratio/high_max": 1.4350314813782461e-05, + "clip_ratio/high_mean": 3.5875787034456152e-06, + "clip_ratio/low_mean": 3.81288905373367e-05, + "clip_ratio/low_min": 8.099272235995159e-06, + "clip_ratio/region_mean": 4.1716469809216505e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 6678.65625, + "completions/mean_terminated_length": 6524.603515625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.9043187350034714, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005933742038905621, + "learning_rate": 1e-05, + "loss": 0.0966, + "num_tokens": 224207006.0, + "reward": 0.484375, + "reward_std": 0.3316681981086731, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000031590461731, + "sampling/importance_sampling_ratio/min": 0.0011734943836927414, + "sampling/sampling_logp_difference/max": 6.747769355773926, + "sampling/sampling_logp_difference/mean": 0.019827336072921753, + "step": 261 + }, + { + "clip_ratio/high_max": 1.6498819377375185e-05, + "clip_ratio/high_mean": 4.124704844343796e-06, + "clip_ratio/low_mean": 3.601791678420341e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014262168539062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6999.0390625, + "completions/mean_terminated_length": 6850.07177734375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8109970837831497, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003635740838944912, + "learning_rate": 1e-05, + "loss": 0.104, + "num_tokens": 
225122891.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303817749023, + "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05, + "sampling/sampling_logp_difference/max": 10.987512588500977, + "sampling/sampling_logp_difference/mean": 0.018912551924586296, + "step": 262 + }, + { + "clip_ratio/high_max": 9.527577958579059e-06, + "clip_ratio/high_mean": 2.3818944896447647e-06, + "clip_ratio/low_mean": 3.766565987461945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004755419373396e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7483.7109375, + "completions/mean_terminated_length": 7045.9912109375, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "entropy": 0.9473970532417297, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003405241761356592, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 226102462.0, + "reward": 0.4453125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002920627594, + "sampling/importance_sampling_ratio/min": 0.00525119062513113, + "sampling/sampling_logp_difference/max": 5.249300479888916, + "sampling/sampling_logp_difference/mean": 0.021076779812574387, + "step": 263 + }, + { + "clip_ratio/high_max": 1.5867321963014547e-05, + "clip_ratio/high_mean": 3.966830490753637e-06, + "clip_ratio/low_mean": 3.8259706570897833e-05, + "clip_ratio/low_min": 3.549019083948224e-06, + "clip_ratio/region_mean": 4.2226537743772496e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 7569.03125, + "completions/mean_terminated_length": 7357.47216796875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9231455475091934, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025927501264959574, + "learning_rate": 1e-05, + "loss": 0.0801, + "num_tokens": 227093562.0, + "reward": 0.3984375, + "reward_std": 0.19097033143043518, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0052477638237178326, + "sampling/sampling_logp_difference/max": 5.249953269958496, + "sampling/sampling_logp_difference/mean": 0.020578444004058838, + "step": 264 + }, + { + "clip_ratio/high_max": 1.344091060673236e-05, + "clip_ratio/high_mean": 3.36022765168309e-06, + "clip_ratio/low_mean": 4.253613235505327e-05, + "clip_ratio/low_min": 3.5579084851633525e-06, + "clip_ratio/region_mean": 4.5896360120423196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 7589.2734375, + "completions/mean_terminated_length": 7378.2001953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9265239909291267, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030512227676808834, + "learning_rate": 1e-05, + "loss": 0.04, + "num_tokens": 228086405.0, + "reward": 0.4296875, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0002165911573683843, + "sampling/sampling_logp_difference/max": 8.437499046325684, + "sampling/sampling_logp_difference/mean": 0.020208362489938736, + "step": 265 + }, + { + "clip_ratio/high_max": 1.9613525410022703e-05, + "clip_ratio/high_mean": 4.903381352505676e-06, + "clip_ratio/low_mean": 3.184792547017423e-05, + "clip_ratio/low_min": 7.29296516510658e-06, + "clip_ratio/region_mean": 3.675130722058384e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 8420.6875, + "completions/mean_terminated_length": 8096.97509765625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "entropy": 0.9572964608669281, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022430522367358208, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 229183765.0, + "reward": 0.34375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 0.00029693738906644285, + "sampling/sampling_logp_difference/max": 8.121989250183105, + "sampling/sampling_logp_difference/mean": 0.021570362150669098, + "step": 266 + }, + { + "clip_ratio/high_max": 6.728750577167375e-06, + "clip_ratio/high_mean": 1.6821876442918438e-06, + "clip_ratio/low_mean": 2.1682553096979973e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.336474062758498e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15736.0, + "completions/mean_length": 6809.765625, + "completions/mean_terminated_length": 6579.984375, + 
"completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.884086549282074, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004295065999031067, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 230077607.0, + "reward": 0.484375, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00754612497985363, + "sampling/sampling_logp_difference/max": 4.886721134185791, + "sampling/sampling_logp_difference/mean": 0.019895706325769424, + "step": 267 + }, + { + "clip_ratio/high_max": 2.8609347509700456e-05, + "clip_ratio/high_mean": 7.152336877425114e-06, + "clip_ratio/low_mean": 5.158006410965754e-05, + "clip_ratio/low_min": 5.210069957684027e-06, + "clip_ratio/region_mean": 5.873240070286556e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15080.0, + "completions/mean_length": 7340.6953125, + "completions/mean_terminated_length": 6973.0810546875, + "completions/min_length": 1616.0, + "completions/min_terminated_length": 1616.0, + "entropy": 0.9920620769262314, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004631794057786465, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 231035616.0, + "reward": 0.4375, + "reward_std": 0.3235401213169098, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.0002508950710762292, + "sampling/sampling_logp_difference/max": 8.290475845336914, + "sampling/sampling_logp_difference/mean": 0.020591016858816147, + 
"step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.3085940774290066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3085940774290066e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14120.0, + "completions/mean_length": 6748.875, + "completions/mean_terminated_length": 6595.93701171875, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.9867061004042625, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035752104595303535, + "learning_rate": 1e-05, + "loss": 0.0455, + "num_tokens": 231920056.0, + "reward": 0.40625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.0003869794018100947, + "sampling/sampling_logp_difference/max": 7.8571391105651855, + "sampling/sampling_logp_difference/mean": 0.02061416581273079, + "step": 269 + }, + { + "clip_ratio/high_max": 1.2506750408647349e-05, + "clip_ratio/high_mean": 3.1266876021618373e-06, + "clip_ratio/low_mean": 3.10397430212106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.416643085074611e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 7260.3046875, + "completions/mean_terminated_length": 7188.46435546875, + "completions/min_length": 1384.0, + "completions/min_terminated_length": 1384.0, + "entropy": 1.0388494208455086, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036644963547587395, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 232869159.0, + "reward": 0.390625, + "reward_std": 
0.2359209954738617, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999546408653259, + "sampling/importance_sampling_ratio/min": 0.0008660226594656706, + "sampling/sampling_logp_difference/max": 7.051599502563477, + "sampling/sampling_logp_difference/mean": 0.02120530977845192, + "step": 270 + }, + { + "clip_ratio/high_max": 2.704355301830219e-05, + "clip_ratio/high_mean": 6.760888254575548e-06, + "clip_ratio/low_mean": 3.1861192269388994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862208097871189e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16073.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 6354.4609375, + "completions/mean_terminated_length": 6354.4609375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "entropy": 0.8405331820249557, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004709267523139715, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 233702842.0, + "reward": 0.546875, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 0.0046309432946145535, + "sampling/sampling_logp_difference/max": 5.37499475479126, + "sampling/sampling_logp_difference/mean": 0.019126038998365402, + "step": 271 + }, + { + "clip_ratio/high_max": 9.749228638611385e-06, + "clip_ratio/high_mean": 2.437307159652846e-06, + "clip_ratio/low_mean": 3.855073941849696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.098804652130639e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16026.0, + "completions/mean_length": 6514.578125, + "completions/mean_terminated_length": 6357.9208984375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 1.0254098922014236, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003066045930609107, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 234556348.0, + "reward": 0.4375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 0.005210204049944878, + "sampling/sampling_logp_difference/max": 5.257136344909668, + "sampling/sampling_logp_difference/mean": 0.019960148259997368, + "step": 272 + }, + { + "clip_ratio/high_max": 1.0475813724042382e-05, + "clip_ratio/high_mean": 2.6189534310105955e-06, + "clip_ratio/low_mean": 3.487835761006863e-05, + "clip_ratio/low_min": 2.9392399483185727e-06, + "clip_ratio/region_mean": 3.749731081370555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 7379.5546875, + "completions/mean_terminated_length": 7236.62744140625, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 1.0397320613265038, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005132520105689764, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 235521091.0, + "reward": 0.2890625, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999256134033203, + "sampling/importance_sampling_ratio/min": 
0.00016659013635944575, + "sampling/sampling_logp_difference/max": 8.699974060058594, + "sampling/sampling_logp_difference/mean": 0.021417103707790375, + "step": 273 + }, + { + "clip_ratio/high_max": 1.9904123973901733e-05, + "clip_ratio/high_mean": 5.776861314643611e-06, + "clip_ratio/low_mean": 2.6659268655748747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2436129686175263e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 7837.1640625, + "completions/mean_terminated_length": 7632.04052734375, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "entropy": 0.8400963917374611, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028969801496714354, + "learning_rate": 1e-05, + "loss": 0.0143, + "num_tokens": 236544160.0, + "reward": 0.3828125, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887943267822, + "sampling/importance_sampling_ratio/min": 2.883308241052873e-07, + "sampling/sampling_logp_difference/max": 15.059157371520996, + "sampling/sampling_logp_difference/mean": 0.019267702475190163, + "step": 274 + }, + { + "clip_ratio/high_max": 8.562770290154731e-06, + "clip_ratio/high_mean": 2.1406925725386827e-06, + "clip_ratio/low_mean": 4.060094340729847e-05, + "clip_ratio/low_min": 3.8700886761944275e-06, + "clip_ratio/region_mean": 4.2741635979837156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15350.0, + "completions/mean_length": 6696.3515625, + "completions/mean_terminated_length": 6542.57958984375, + "completions/min_length": 1239.0, + "completions/min_terminated_length": 1239.0, + "entropy": 
0.8495818004012108, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003412836929783225, + "learning_rate": 1e-05, + "loss": 0.0803, + "num_tokens": 237423101.0, + "reward": 0.515625, + "reward_std": 0.37981897592544556, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.012152798473834991, + "sampling/sampling_logp_difference/max": 4.410195827484131, + "sampling/sampling_logp_difference/mean": 0.018458625301718712, + "step": 275 + }, + { + "clip_ratio/high_max": 1.1463653436294408e-05, + "clip_ratio/high_mean": 3.646129641765583e-06, + "clip_ratio/low_mean": 6.144847083078275e-05, + "clip_ratio/low_min": 1.110105540647055e-05, + "clip_ratio/region_mean": 6.509460160941671e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15666.0, + "completions/mean_length": 7700.3671875, + "completions/mean_terminated_length": 7121.45849609375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.8258870914578438, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024443145375698805, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 238429956.0, + "reward": 0.375, + "reward_std": 0.2872493863105774, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999113082885742, + "sampling/importance_sampling_ratio/min": 0.00026112530031241477, + "sampling/sampling_logp_difference/max": 8.250510215759277, + "sampling/sampling_logp_difference/mean": 0.019427984952926636, + "step": 276 + }, + { + "clip_ratio/high_max": 4.218127742205979e-06, + "clip_ratio/high_mean": 
1.0545319355514948e-06, + "clip_ratio/low_mean": 1.7289162997258245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.834369493280974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16112.0, + "completions/mean_length": 6255.21875, + "completions/mean_terminated_length": 6094.44482421875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.8179014846682549, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022747826296836138, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 239250160.0, + "reward": 0.5234375, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.0002633975527714938, + "sampling/sampling_logp_difference/max": 8.241846084594727, + "sampling/sampling_logp_difference/mean": 0.018723051995038986, + "step": 277 + }, + { + "clip_ratio/high_max": 1.698448841125355e-05, + "clip_ratio/high_mean": 5.369374321162468e-06, + "clip_ratio/low_mean": 6.14647315160255e-05, + "clip_ratio/low_min": 5.043576493335422e-06, + "clip_ratio/region_mean": 6.683410583718796e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15321.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6914.9609375, + "completions/mean_terminated_length": 6914.9609375, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9700981751084328, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005685295443981886, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 240156211.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 
0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05, + "sampling/sampling_logp_difference/max": 9.997581481933594, + "sampling/sampling_logp_difference/mean": 0.021195171400904655, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9186837764427764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9186837764427764e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15469.0, + "completions/mean_length": 5227.53125, + "completions/mean_terminated_length": 5139.68505859375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9116031974554062, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003880272386595607, + "learning_rate": 1e-05, + "loss": 0.1246, + "num_tokens": 240845295.0, + "reward": 0.6328125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000362396240234, + "sampling/importance_sampling_ratio/min": 0.00012422871077433228, + "sampling/sampling_logp_difference/max": 8.993386268615723, + "sampling/sampling_logp_difference/mean": 0.018801718950271606, + "step": 279 + }, + { + "clip_ratio/high_max": 2.5015486926349695e-05, + "clip_ratio/high_mean": 8.084949570275057e-06, + "clip_ratio/low_mean": 5.524710468307603e-05, + "clip_ratio/low_min": 3.776891389861703e-06, + "clip_ratio/region_mean": 6.333205465125502e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 
8065.4765625, + "completions/mean_terminated_length": 7510.90869140625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.7446574792265892, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028986844699829817, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 241895676.0, + "reward": 0.4921875, + "reward_std": 0.3474721610546112, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.0017039099475368857, + "sampling/sampling_logp_difference/max": 6.3748297691345215, + "sampling/sampling_logp_difference/mean": 0.01853121444582939, + "step": 280 + }, + { + "clip_ratio/high_max": 9.486341014053323e-06, + "clip_ratio/high_mean": 2.371585253513331e-06, + "clip_ratio/low_mean": 2.896106741445692e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133265261112683e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15534.0, + "completions/max_terminated_length": 15534.0, + "completions/mean_length": 6127.359375, + "completions/mean_terminated_length": 6127.359375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.8569132760167122, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003845847910270095, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 242698258.0, + "reward": 0.53125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000942945480347, + "sampling/importance_sampling_ratio/min": 0.00043231461313553154, + "sampling/sampling_logp_difference/max": 7.746356964111328, + 
"sampling/sampling_logp_difference/mean": 0.01856958493590355, + "step": 281 + }, + { + "clip_ratio/high_max": 2.9848330086679198e-05, + "clip_ratio/high_mean": 7.4620825216697995e-06, + "clip_ratio/low_mean": 4.3558867673709756e-05, + "clip_ratio/low_min": 4.417741820361698e-06, + "clip_ratio/region_mean": 5.1020949285884853e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15192.0, + "completions/mean_length": 6600.1484375, + "completions/mean_terminated_length": 6365.33642578125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.78924310952425, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003953634761273861, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 243560957.0, + "reward": 0.5546875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.0006525487406179309, + "sampling/sampling_logp_difference/max": 7.334624767303467, + "sampling/sampling_logp_difference/mean": 0.018097909167408943, + "step": 282 + }, + { + "clip_ratio/high_max": 6.635561703660642e-06, + "clip_ratio/high_mean": 1.6588904259151604e-06, + "clip_ratio/low_mean": 2.737523408313791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9034124281679397e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7852.171875, + "completions/mean_terminated_length": 7852.171875, + "completions/min_length": 1276.0, + "completions/min_terminated_length": 1276.0, + "entropy": 1.0598893761634827, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00360781978815794, 
+ "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 244585923.0, + "reward": 0.3125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05, + "sampling/sampling_logp_difference/max": 10.076086044311523, + "sampling/sampling_logp_difference/mean": 0.022330068051815033, + "step": 283 + }, + { + "clip_ratio/high_max": 3.1540168947685743e-06, + "clip_ratio/high_mean": 7.885042236921436e-07, + "clip_ratio/low_mean": 4.7973388973332476e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.876189268543385e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7972.2265625, + "completions/mean_terminated_length": 7700.87890625, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.933217465877533, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0027661293279379606, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 245628064.0, + "reward": 0.28125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05, + "sampling/sampling_logp_difference/max": 10.366576194763184, + "sampling/sampling_logp_difference/mean": 0.021125148981809616, + "step": 284 + }, + { + "clip_ratio/high_max": 1.2965969062861404e-05, + "clip_ratio/high_mean": 3.241492265715351e-06, + "clip_ratio/low_mean": 4.6317693090713874e-05, + "clip_ratio/low_min": 3.820877282123547e-06, + 
"clip_ratio/region_mean": 4.955918507221213e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7135.6953125, + "completions/mean_terminated_length": 6913.736328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.7786942347884178, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005680318456143141, + "learning_rate": 1e-05, + "loss": 0.0786, + "num_tokens": 246561329.0, + "reward": 0.4296875, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462366104126, + "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05, + "sampling/sampling_logp_difference/max": 9.737424850463867, + "sampling/sampling_logp_difference/mean": 0.018504241481423378, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.22437145175536e-05, + "clip_ratio/low_min": 1.4025082009538892e-05, + "clip_ratio/region_mean": 4.22437145175536e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6704.046875, + "completions/mean_terminated_length": 6627.82666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 1.0435140281915665, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026402862276881933, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 247437415.0, + "reward": 0.3828125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.0007800163584761322, + "sampling/sampling_logp_difference/max": 7.156195640563965, + "sampling/sampling_logp_difference/mean": 0.02134273201227188, + "step": 286 + }, + { + "clip_ratio/high_max": 2.223430897174694e-05, + "clip_ratio/high_mean": 6.8746438159905665e-06, + "clip_ratio/low_mean": 4.7084630978133646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3959275192028144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 5892.5078125, + "completions/mean_terminated_length": 5725.9765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.8004944771528244, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003993614576756954, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 248211112.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0024652592837810516, + "sampling/sampling_logp_difference/max": 6.005458354949951, + "sampling/sampling_logp_difference/mean": 0.01924925297498703, + "step": 287 + }, + { + "clip_ratio/high_max": 2.1833082200828358e-05, + "clip_ratio/high_mean": 5.458270550207089e-06, + "clip_ratio/low_mean": 3.415995615796419e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961822596920683e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 7812.140625, + "completions/mean_terminated_length": 7316.24755859375, + "completions/min_length": 1515.0, + 
"completions/min_terminated_length": 1515.0, + "entropy": 0.8841542899608612, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001573400106281042, + "learning_rate": 1e-05, + "loss": 0.0823, + "num_tokens": 249228106.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 0.001001527882181108, + "sampling/sampling_logp_difference/max": 6.906228542327881, + "sampling/sampling_logp_difference/mean": 0.01956877112388611, + "step": 288 + }, + { + "clip_ratio/high_max": 1.014439021673752e-05, + "clip_ratio/high_mean": 2.53609755418438e-06, + "clip_ratio/low_mean": 3.068193461785995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.321803217204433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 6372.953125, + "completions/mean_terminated_length": 6132.6884765625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.8228401988744736, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021125099156051874, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 250063284.0, + "reward": 0.5, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05, + "sampling/sampling_logp_difference/max": 9.937475204467773, + "sampling/sampling_logp_difference/mean": 0.01943521574139595, + "step": 289 + }, + { + "clip_ratio/high_max": 
7.023906164249638e-06, + "clip_ratio/high_mean": 1.7559765410624095e-06, + "clip_ratio/low_mean": 2.526416994896863e-05, + "clip_ratio/low_min": 6.7760895490209805e-06, + "clip_ratio/region_mean": 2.7020146660561295e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16270.0, + "completions/mean_length": 7817.8671875, + "completions/mean_terminated_length": 7396.58154296875, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.9454319775104523, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022315154783427715, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 251085123.0, + "reward": 0.40625, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06, + "sampling/sampling_logp_difference/max": 12.760490417480469, + "sampling/sampling_logp_difference/mean": 0.021764669567346573, + "step": 290 + }, + { + "clip_ratio/high_max": 1.4797966287005693e-05, + "clip_ratio/high_mean": 3.699491571751423e-06, + "clip_ratio/low_mean": 4.36271948274225e-05, + "clip_ratio/low_min": 3.6957101201551268e-06, + "clip_ratio/region_mean": 4.732668639917392e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 7168.4921875, + "completions/mean_terminated_length": 6635.36328125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8433891162276268, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 252020906.0, + "reward": 
0.5546875, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.0003851866349577904, + "sampling/sampling_logp_difference/max": 7.861782550811768, + "sampling/sampling_logp_difference/mean": 0.01929781585931778, + "step": 291 + }, + { + "clip_ratio/high_max": 1.996871560550062e-05, + "clip_ratio/high_mean": 6.089093403716106e-06, + "clip_ratio/low_mean": 4.2792244585143635e-05, + "clip_ratio/low_min": 1.0337215371691855e-05, + "clip_ratio/region_mean": 4.8881338216233416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7322.5078125, + "completions/mean_terminated_length": 6876.8603515625, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 0.9157031401991844, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036942458245903254, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 252977435.0, + "reward": 0.3359375, + "reward_std": 0.24275577068328857, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.00029605376766994596, + "sampling/sampling_logp_difference/max": 8.124969482421875, + "sampling/sampling_logp_difference/mean": 0.0205365102738142, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.631919460327481e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.631919460327481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16078.0, + "completions/mean_length": 7025.484375, + "completions/mean_terminated_length": 6723.5966796875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 1.1329731941223145, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034127074759453535, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 253896161.0, + "reward": 0.25, + "reward_std": 0.27722424268722534, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0005197672289796174, + "sampling/sampling_logp_difference/max": 7.562129497528076, + "sampling/sampling_logp_difference/mean": 0.023741140961647034, + "step": 293 + }, + { + "clip_ratio/high_max": 4.368643658381188e-06, + "clip_ratio/high_mean": 1.092160914595297e-06, + "clip_ratio/low_mean": 2.4661783299961826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5753944555617636e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13776.0, + "completions/mean_length": 5996.1796875, + "completions/mean_terminated_length": 5661.08837890625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8773328885436058, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003959407564252615, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 254690264.0, + "reward": 0.53125, + "reward_std": 0.26645541191101074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07, + 
"sampling/sampling_logp_difference/max": 15.73043155670166, + "sampling/sampling_logp_difference/mean": 0.018407585099339485, + "step": 294 + }, + { + "clip_ratio/high_max": 1.616483677935321e-05, + "clip_ratio/high_mean": 4.041209194838302e-06, + "clip_ratio/low_mean": 3.736187466074625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140308453770558e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7165.328125, + "completions/mean_terminated_length": 6867.951171875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.9502597972750664, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030910037457942963, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 255626394.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000731945037842, + "sampling/importance_sampling_ratio/min": 0.00022311302018351853, + "sampling/sampling_logp_difference/max": 8.407832145690918, + "sampling/sampling_logp_difference/mean": 0.020668907091021538, + "step": 295 + }, + { + "clip_ratio/high_max": 1.1702686606440693e-05, + "clip_ratio/high_mean": 2.9256716516101733e-06, + "clip_ratio/low_mean": 5.5247357522603124e-05, + "clip_ratio/low_min": 3.6811261452385224e-06, + "clip_ratio/region_mean": 5.8173028264718596e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15375.0, + "completions/mean_length": 8001.9296875, + "completions/mean_terminated_length": 7661.34912109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8591345250606537, + "epoch": 
0.27230910763569455, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037233952898532152, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 256673457.0, + "reward": 0.421875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.0021876997780054808, + "sampling/sampling_logp_difference/max": 6.124904632568359, + "sampling/sampling_logp_difference/mean": 0.020540472120046616, + "step": 296 + }, + { + "clip_ratio/high_max": 3.721341136042611e-05, + "clip_ratio/high_mean": 1.2759249216287571e-05, + "clip_ratio/low_mean": 3.570647322703735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.846572301175911e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 6924.84375, + "completions/mean_terminated_length": 6697.82421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.7969356626272202, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006054217461496592, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 257578501.0, + "reward": 0.5078125, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.007889713160693645, + "sampling/sampling_logp_difference/max": 4.842195510864258, + "sampling/sampling_logp_difference/mean": 0.019306108355522156, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0211543894911301e-05, + "clip_ratio/high_mean": 2.5528859737278253e-06, + 
"clip_ratio/low_mean": 5.2388056587915344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4940942732173426e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14439.0, + "completions/mean_length": 6203.03125, + "completions/mean_terminated_length": 5958.6884765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.8734413683414459, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004903806839138269, + "learning_rate": 1e-05, + "loss": 0.0689, + "num_tokens": 258392625.0, + "reward": 0.4453125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 0.00020370795391499996, + "sampling/sampling_logp_difference/max": 8.498823165893555, + "sampling/sampling_logp_difference/mean": 0.01909301057457924, + "step": 298 + }, + { + "clip_ratio/high_max": 1.5135058674786706e-05, + "clip_ratio/high_mean": 4.64845766146027e-06, + "clip_ratio/low_mean": 4.373456977191381e-05, + "clip_ratio/low_min": 3.670856358439778e-06, + "clip_ratio/region_mean": 4.8383026296505705e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 7982.5390625, + "completions/mean_terminated_length": 7641.01611328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0091779381036758, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033637424930930138, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 259435270.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999765753746033, + "sampling/importance_sampling_ratio/min": 0.0016514655435457826, + "sampling/sampling_logp_difference/max": 6.406092166900635, + "sampling/sampling_logp_difference/mean": 0.02182736061513424, + "step": 299 + }, + { + "clip_ratio/high_max": 2.3964702677403693e-05, + "clip_ratio/high_mean": 5.991175669350923e-06, + "clip_ratio/low_mean": 5.2442986770984135e-05, + "clip_ratio/low_min": 8.75736759553547e-06, + "clip_ratio/region_mean": 5.843416238349164e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6915.3125, + "completions/mean_terminated_length": 6688.064453125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.7964543774724007, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052203768864274025, + "learning_rate": 1e-05, + "loss": 0.144, + "num_tokens": 260337614.0, + "reward": 0.46875, + "reward_std": 0.37928223609924316, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 7.032832218101248e-05, + "sampling/sampling_logp_difference/max": 9.562335968017578, + "sampling/sampling_logp_difference/mean": 0.017896221950650215, + "step": 300 + }, + { + "clip_ratio/high_max": 4.458271632756805e-05, + "clip_ratio/high_mean": 1.1145679081892013e-05, + "clip_ratio/low_mean": 6.243192206056847e-05, + "clip_ratio/low_min": 1.2397775662975619e-05, + "clip_ratio/region_mean": 7.357759886872373e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + 
"completions/mean_length": 7029.4375, + "completions/mean_terminated_length": 6880.95263671875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.8605096861720085, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005570738110691309, + "learning_rate": 1e-05, + "loss": 0.0984, + "num_tokens": 261254070.0, + "reward": 0.4765625, + "reward_std": 0.3327290117740631, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999494552612305, + "sampling/importance_sampling_ratio/min": 0.0009070249507203698, + "sampling/sampling_logp_difference/max": 7.005340576171875, + "sampling/sampling_logp_difference/mean": 0.01905740052461624, + "step": 301 + }, + { + "clip_ratio/high_max": 3.390461233720998e-05, + "clip_ratio/high_mean": 1.1191766247975465e-05, + "clip_ratio/low_mean": 7.46641262594494e-05, + "clip_ratio/low_min": 5.041745680500753e-06, + "clip_ratio/region_mean": 8.585589102949598e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5858.84375, + "completions/mean_terminated_length": 5606.240234375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8430554121732712, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004496110137552023, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 262024906.0, + "reward": 0.4453125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294877052307, + "sampling/importance_sampling_ratio/min": 0.00040469475788995624, + 
"sampling/sampling_logp_difference/max": 7.812377452850342, + "sampling/sampling_logp_difference/mean": 0.019225869327783585, + "step": 302 + }, + { + "clip_ratio/high_max": 3.2563955301156966e-06, + "clip_ratio/high_mean": 8.140988825289242e-07, + "clip_ratio/low_mean": 3.7080020149460324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.789411886145899e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15976.0, + "completions/mean_length": 8337.328125, + "completions/mean_terminated_length": 7728.7568359375, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.901745393872261, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00348713924176991, + "learning_rate": 1e-05, + "loss": -0.0002, + "num_tokens": 263110844.0, + "reward": 0.296875, + "reward_std": 0.20805485546588898, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.0022652465850114822, + "sampling/sampling_logp_difference/max": 6.090071678161621, + "sampling/sampling_logp_difference/mean": 0.02157524600625038, + "step": 303 + }, + { + "clip_ratio/high_max": 2.3739744847262045e-05, + "clip_ratio/high_mean": 5.934936211815511e-06, + "clip_ratio/low_mean": 2.823553325015382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.417046866616147e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7084.7265625, + "completions/mean_terminated_length": 6381.42041015625, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8265534415841103, + "epoch": 0.2796688132474701, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003980033565312624, + "learning_rate": 1e-05, + "loss": 0.0551, + "num_tokens": 264036169.0, + "reward": 0.3984375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673366546631, + "sampling/importance_sampling_ratio/min": 0.00012345099821686745, + "sampling/sampling_logp_difference/max": 8.999666213989258, + "sampling/sampling_logp_difference/mean": 0.018782664090394974, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1745505617000163e-05, + "clip_ratio/high_mean": 3.771558226617344e-06, + "clip_ratio/low_mean": 6.913120819262986e-05, + "clip_ratio/low_min": 2.494283216947224e-05, + "clip_ratio/region_mean": 7.290276607818669e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6543.796875, + "completions/mean_terminated_length": 6543.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8899869695305824, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.006467343773692846, + "learning_rate": 1e-05, + "loss": 0.1139, + "num_tokens": 264892767.0, + "reward": 0.484375, + "reward_std": 0.3934885561466217, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000489950180054, + "sampling/importance_sampling_ratio/min": 9.891482477542013e-05, + "sampling/sampling_logp_difference/max": 9.221251487731934, + "sampling/sampling_logp_difference/mean": 0.02032080665230751, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.395576979732141e-05, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 4.395576979732141e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16307.0, + "completions/mean_length": 8483.390625, + "completions/mean_terminated_length": 7813.84765625, + "completions/min_length": 1342.0, + "completions/min_terminated_length": 1342.0, + "entropy": 0.9621479511260986, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003174177836626768, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 265995697.0, + "reward": 0.3359375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.0005628522485494614, + "sampling/sampling_logp_difference/max": 7.4824934005737305, + "sampling/sampling_logp_difference/mean": 0.02145479805767536, + "step": 306 + }, + { + "clip_ratio/high_max": 1.2596524811669951e-05, + "clip_ratio/high_mean": 3.149131202917488e-06, + "clip_ratio/low_mean": 3.7911659774181317e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.106079018129094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14985.0, + "completions/mean_length": 7184.578125, + "completions/mean_terminated_length": 6963.79248046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9993807673454285, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003356153378263116, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 266937707.0, + "reward": 0.3828125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.0017036627978086472, + "sampling/sampling_logp_difference/max": 6.374974727630615, + "sampling/sampling_logp_difference/mean": 0.02204768732190132, + "step": 307 + }, + { + "clip_ratio/high_max": 1.9245163684900035e-05, + "clip_ratio/high_mean": 4.811290921225009e-06, + "clip_ratio/low_mean": 4.8845648166206956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.365693925796222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16216.0, + "completions/mean_length": 7029.2265625, + "completions/mean_terminated_length": 6727.45947265625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.9139953926205635, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006375293247401714, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 267853880.0, + "reward": 0.4765625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.010649868287146091, + "sampling/sampling_logp_difference/max": 4.542207717895508, + "sampling/sampling_logp_difference/mean": 0.020365029573440552, + "step": 308 + }, + { + "clip_ratio/high_max": 4.812504812434781e-06, + "clip_ratio/high_mean": 1.2031262031086953e-06, + "clip_ratio/low_mean": 2.5999243803198624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.720237000630732e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6188.0078125, + "completions/mean_terminated_length": 
5943.30419921875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.7640773430466652, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003697809297591448, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 268665721.0, + "reward": 0.5078125, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372363090515, + "sampling/importance_sampling_ratio/min": 0.02927250787615776, + "sampling/sampling_logp_difference/max": 3.531106472015381, + "sampling/sampling_logp_difference/mean": 0.016581017524003983, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1358927824621787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1358927824621787e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 8128.21875, + "completions/mean_terminated_length": 7861.90283203125, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.8218234181404114, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002286596456542611, + "learning_rate": 1e-05, + "loss": 0.0763, + "num_tokens": 269726181.0, + "reward": 0.375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798536300659, + "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06, + "sampling/sampling_logp_difference/max": 12.90043830871582, + "sampling/sampling_logp_difference/mean": 0.019403984770178795, + "step": 310 + }, + { + 
"clip_ratio/high_max": 1.4808477317274082e-05, + "clip_ratio/high_mean": 3.7021193293185206e-06, + "clip_ratio/low_mean": 3.0363167581981543e-05, + "clip_ratio/low_min": 6.364238288369961e-06, + "clip_ratio/region_mean": 3.4065286854456645e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 5673.3359375, + "completions/mean_terminated_length": 5503.32568359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.9275510385632515, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00485506234690547, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 270470616.0, + "reward": 0.4921875, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.0009123464697040617, + "sampling/sampling_logp_difference/max": 6.999490737915039, + "sampling/sampling_logp_difference/mean": 0.01881871558725834, + "step": 311 + }, + { + "clip_ratio/high_max": 1.1274602456978755e-05, + "clip_ratio/high_mean": 3.6739949109687586e-06, + "clip_ratio/low_mean": 3.968570712231667e-05, + "clip_ratio/low_min": 3.4213767321489286e-06, + "clip_ratio/region_mean": 4.335970191959859e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 6944.8984375, + "completions/mean_terminated_length": 6795.07177734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9335741624236107, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005874342750757933, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 
271377723.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000594854354858, + "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05, + "sampling/sampling_logp_difference/max": 10.049861907958984, + "sampling/sampling_logp_difference/mean": 0.020590776577591896, + "step": 312 + }, + { + "clip_ratio/high_max": 1.264126694877632e-05, + "clip_ratio/high_mean": 3.16031673719408e-06, + "clip_ratio/low_mean": 3.206376845810155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.522408474054828e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7705.625, + "completions/mean_terminated_length": 7278.8193359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.8491624072194099, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001684082904830575, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 272384891.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 6.605865200981498e-05, + "sampling/sampling_logp_difference/max": 9.624967575073242, + "sampling/sampling_logp_difference/mean": 0.020136822015047073, + "step": 313 + }, + { + "clip_ratio/high_max": 9.772357770998497e-06, + "clip_ratio/high_mean": 2.443089442749624e-06, + "clip_ratio/low_mean": 3.8573590472879005e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101667946088128e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6611.1484375, + "completions/mean_terminated_length": 6534.19677734375, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "entropy": 0.8867302760481834, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003692191792652011, + "learning_rate": 1e-05, + "loss": 0.1233, + "num_tokens": 273251630.0, + "reward": 0.3984375, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.0031062732450664043, + "sampling/sampling_logp_difference/max": 5.774331569671631, + "sampling/sampling_logp_difference/mean": 0.019237037748098373, + "step": 314 + }, + { + "clip_ratio/high_max": 3.0103737344688852e-05, + "clip_ratio/high_mean": 9.664363972206047e-06, + "clip_ratio/low_mean": 1.7575501146893657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.723986426644842e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15786.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 6770.46875, + "completions/mean_terminated_length": 6770.46875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.8252957463264465, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004167635925114155, + "learning_rate": 1e-05, + "loss": -0.0072, + "num_tokens": 274146482.0, + "reward": 0.5703125, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + 
"sampling/importance_sampling_ratio/min": 0.00010247006866848096, + "sampling/sampling_logp_difference/max": 9.18593978881836, + "sampling/sampling_logp_difference/mean": 0.019684650003910065, + "step": 315 + }, + { + "clip_ratio/high_max": 6.529460733872838e-06, + "clip_ratio/high_mean": 1.6323651834682096e-06, + "clip_ratio/low_mean": 3.877351048231503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.040587566578324e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15827.0, + "completions/mean_length": 8210.859375, + "completions/mean_terminated_length": 7365.36181640625, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.8118235394358635, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030363225378096104, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 275214040.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998943209648132, + "sampling/importance_sampling_ratio/min": 0.002854935359209776, + "sampling/sampling_logp_difference/max": 5.858705997467041, + "sampling/sampling_logp_difference/mean": 0.019275270402431488, + "step": 316 + }, + { + "clip_ratio/high_max": 7.0800629146106075e-06, + "clip_ratio/high_mean": 1.7700157286526519e-06, + "clip_ratio/low_mean": 2.3981688286767167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5751703674359305e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14900.0, + "completions/mean_length": 7072.8828125, + "completions/mean_terminated_length": 6849.41650390625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 
0.8018335327506065, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004777858033776283, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 276138049.0, + "reward": 0.453125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 0.0028502768836915493, + "sampling/sampling_logp_difference/max": 5.860339164733887, + "sampling/sampling_logp_difference/mean": 0.01849908009171486, + "step": 317 + }, + { + "clip_ratio/high_max": 2.259368602608447e-05, + "clip_ratio/high_mean": 5.648421506521117e-06, + "clip_ratio/low_mean": 4.28424866640853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.849090737479855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14447.0, + "completions/mean_length": 5889.8359375, + "completions/mean_terminated_length": 5723.26220703125, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.7976400703191757, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030593445990234613, + "learning_rate": 1e-05, + "loss": 0.1331, + "num_tokens": 276910124.0, + "reward": 0.5859375, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.000139843366923742, + "sampling/sampling_logp_difference/max": 8.874987602233887, + "sampling/sampling_logp_difference/mean": 0.01834402233362198, + "step": 318 + }, + { + "clip_ratio/high_max": 1.4654247024736833e-05, + "clip_ratio/high_mean": 
3.663561756184208e-06, + "clip_ratio/low_mean": 2.377464920755301e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7438210736363544e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 7144.265625, + "completions/mean_terminated_length": 6689.85205078125, + "completions/min_length": 1200.0, + "completions/min_terminated_length": 1200.0, + "entropy": 0.8309404999017715, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004245694726705551, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 277843542.0, + "reward": 0.4453125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998534321784973, + "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05, + "sampling/sampling_logp_difference/max": 11.499897956848145, + "sampling/sampling_logp_difference/mean": 0.01875344291329384, + "step": 319 + }, + { + "clip_ratio/high_max": 6.252500952541595e-06, + "clip_ratio/high_mean": 2.241558604509919e-06, + "clip_ratio/low_mean": 4.735765514851664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9599213525652885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15722.0, + "completions/mean_length": 6779.5234375, + "completions/mean_terminated_length": 6703.8974609375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.9584890529513359, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035574575886130333, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 278730129.0, + "reward": 0.3984375, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 
0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.005792221520096064, + "sampling/sampling_logp_difference/max": 5.151239395141602, + "sampling/sampling_logp_difference/mean": 0.02137477695941925, + "step": 320 + }, + { + "clip_ratio/high_max": 3.2948471016425174e-05, + "clip_ratio/high_mean": 9.518853403278627e-06, + "clip_ratio/low_mean": 2.195712454522436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.14759782895635e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15892.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 5582.9765625, + "completions/mean_terminated_length": 5582.9765625, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8629376217722893, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037982752546668053, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 279462542.0, + "reward": 0.5546875, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999780058860779, + "sampling/importance_sampling_ratio/min": 0.0021874974481761456, + "sampling/sampling_logp_difference/max": 6.124997138977051, + "sampling/sampling_logp_difference/mean": 0.01906203106045723, + "step": 321 + }, + { + "clip_ratio/high_max": 1.1029473625967512e-05, + "clip_ratio/high_mean": 2.757368406491878e-06, + "clip_ratio/low_mean": 5.367386921761863e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6431237737797346e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 
6942.2578125, + "completions/mean_terminated_length": 6477.90966796875, + "completions/min_length": 1156.0, + "completions/min_terminated_length": 1156.0, + "entropy": 0.8147861957550049, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027678858023136854, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 280370207.0, + "reward": 0.4375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998471736907959, + "sampling/importance_sampling_ratio/min": 0.00023058800434228033, + "sampling/sampling_logp_difference/max": 8.3748779296875, + "sampling/sampling_logp_difference/mean": 0.01940828748047352, + "step": 322 + }, + { + "clip_ratio/high_max": 2.6367894406575942e-05, + "clip_ratio/high_mean": 8.765707434577052e-06, + "clip_ratio/low_mean": 3.232976985145797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.109547796815605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6242.53125, + "completions/mean_terminated_length": 5915.38671875, + "completions/min_length": 1220.0, + "completions/min_terminated_length": 1220.0, + "entropy": 0.878915011882782, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00577945914119482, + "learning_rate": 1e-05, + "loss": 0.0839, + "num_tokens": 281189491.0, + "reward": 0.515625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 9.611724817659706e-05, + "sampling/sampling_logp_difference/max": 9.2499418258667, + 
"sampling/sampling_logp_difference/mean": 0.01948760263621807, + "step": 323 + }, + { + "clip_ratio/high_max": 3.50839609382092e-05, + "clip_ratio/high_mean": 1.1664920634757436e-05, + "clip_ratio/low_mean": 1.833109013205103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9996010880495305e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 7004.015625, + "completions/mean_terminated_length": 6622.71533203125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.7964659407734871, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014128695474937558, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 282103997.0, + "reward": 0.4140625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.0024504722096025944, + "sampling/sampling_logp_difference/max": 6.011474609375, + "sampling/sampling_logp_difference/mean": 0.019019678235054016, + "step": 324 + }, + { + "clip_ratio/high_max": 1.832260545597819e-05, + "clip_ratio/high_mean": 4.580651363994548e-06, + "clip_ratio/low_mean": 5.309064226821647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.767129368905444e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7822.6953125, + "completions/mean_terminated_length": 7546.52392578125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.8571138679981232, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002476039342582226, + "learning_rate": 
1e-05, + "loss": 0.0515, + "num_tokens": 283122382.0, + "reward": 0.4609375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.0009774373611435294, + "sampling/sampling_logp_difference/max": 6.930576324462891, + "sampling/sampling_logp_difference/mean": 0.020557202398777008, + "step": 325 + }, + { + "clip_ratio/high_max": 5.738419986300869e-06, + "clip_ratio/high_mean": 1.4346049965752172e-06, + "clip_ratio/low_mean": 4.19679121819172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3402517292179255e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7738.8984375, + "completions/mean_terminated_length": 6844.57763671875, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "entropy": 0.7839021533727646, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005309853237122297, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 284130081.0, + "reward": 0.5234375, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998971223831177, + "sampling/importance_sampling_ratio/min": 0.0001319014554610476, + "sampling/sampling_logp_difference/max": 8.933455467224121, + "sampling/sampling_logp_difference/mean": 0.01873316988348961, + "step": 326 + }, + { + "clip_ratio/high_max": 1.007085802484653e-05, + "clip_ratio/high_mean": 2.5177145062116324e-06, + "clip_ratio/low_mean": 4.043528815600439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.295300277590286e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15952.0, + "completions/mean_length": 7102.2421875, + "completions/mean_terminated_length": 6954.9130859375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8530801385641098, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228116944432259, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 285058720.0, + "reward": 0.5078125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00012956927821505815, + "sampling/sampling_logp_difference/max": 8.951294898986816, + "sampling/sampling_logp_difference/mean": 0.019325006753206253, + "step": 327 + }, + { + "clip_ratio/high_max": 4.06874551117653e-06, + "clip_ratio/high_mean": 1.0171863777941326e-06, + "clip_ratio/low_mean": 3.661125703047219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.762844340826632e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15594.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 6583.4765625, + "completions/mean_terminated_length": 6583.4765625, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.021921381354332, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004967439454048872, + "learning_rate": 1e-05, + "loss": 0.0374, + "num_tokens": 285919765.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, 
+ "sampling/importance_sampling_ratio/min": 0.016675354912877083, + "sampling/sampling_logp_difference/max": 4.093823432922363, + "sampling/sampling_logp_difference/mean": 0.021393200382590294, + "step": 328 + }, + { + "clip_ratio/high_max": 1.2215251445013564e-05, + "clip_ratio/high_mean": 3.053812861253391e-06, + "clip_ratio/low_mean": 4.05305947879242e-05, + "clip_ratio/low_min": 4.215567059873138e-06, + "clip_ratio/region_mean": 4.358440742180392e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16299.0, + "completions/mean_length": 7770.5859375, + "completions/mean_terminated_length": 7346.97509765625, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "entropy": 1.0466903448104858, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004189736675471067, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 286935512.0, + "reward": 0.3828125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.011683559976518154, + "sampling/sampling_logp_difference/max": 4.449572563171387, + "sampling/sampling_logp_difference/mean": 0.021805983036756516, + "step": 329 + }, + { + "clip_ratio/high_max": 2.0567378214764176e-05, + "clip_ratio/high_mean": 5.141844553691044e-06, + "clip_ratio/low_mean": 1.8177100628236076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3318944840866607e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15758.0, + "completions/mean_length": 5689.2421875, + "completions/mean_terminated_length": 5432.568359375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 
1194.0, + "entropy": 0.7778806164860725, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032866497058421373, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 287681943.0, + "reward": 0.640625, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940812587738, + "sampling/importance_sampling_ratio/min": 0.00038077132194302976, + "sampling/sampling_logp_difference/max": 7.873311519622803, + "sampling/sampling_logp_difference/mean": 0.01789461076259613, + "step": 330 + }, + { + "clip_ratio/high_max": 3.109086901531555e-05, + "clip_ratio/high_mean": 7.772717253828887e-06, + "clip_ratio/low_mean": 3.1423560130861006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919627738468989e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13820.0, + "completions/mean_length": 6288.1875, + "completions/mean_terminated_length": 6127.93701171875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.7709921672940254, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023572889622300863, + "learning_rate": 1e-05, + "loss": 0.0746, + "num_tokens": 288506735.0, + "reward": 0.484375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 0.000430915504693985, + "sampling/sampling_logp_difference/max": 7.749598503112793, + "sampling/sampling_logp_difference/mean": 0.017407266423106194, + "step": 331 + }, + { + "clip_ratio/high_max": 3.4638953366084024e-05, + 
"clip_ratio/high_mean": 9.51674803673086e-06, + "clip_ratio/low_mean": 6.26047980176736e-05, + "clip_ratio/low_min": 5.51267930859467e-06, + "clip_ratio/region_mean": 7.212154741864651e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 6775.0234375, + "completions/mean_terminated_length": 6465.05615234375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9338318258523941, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034220058005303144, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 289395498.0, + "reward": 0.390625, + "reward_std": 0.34533774852752686, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.0317598432302475, + "sampling/sampling_logp_difference/max": 3.449552536010742, + "sampling/sampling_logp_difference/mean": 0.019930530339479446, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.159989991123439e-05, + "clip_ratio/low_min": 1.5592839645250933e-05, + "clip_ratio/region_mean": 7.159989991123439e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 7142.9375, + "completions/mean_terminated_length": 6844.83837890625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.971405878663063, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002513247774913907, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 290329082.0, + "reward": 0.328125, + "reward_std": 0.28930896520614624, + 
"rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 3.152207455059397e-07, + "sampling/sampling_logp_difference/max": 14.969992637634277, + "sampling/sampling_logp_difference/mean": 0.022366533055901527, + "step": 333 + }, + { + "clip_ratio/high_max": 1.6507752206962323e-05, + "clip_ratio/high_mean": 4.126938051740581e-06, + "clip_ratio/low_mean": 1.7493430505055585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1620368215735652e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15581.0, + "completions/mean_length": 6412.2109375, + "completions/mean_terminated_length": 6333.69287109375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "entropy": 0.9136044681072235, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0056767817586660385, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 291170133.0, + "reward": 0.421875, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999720454216003, + "sampling/importance_sampling_ratio/min": 0.000458698661532253, + "sampling/sampling_logp_difference/max": 7.687117099761963, + "sampling/sampling_logp_difference/mean": 0.020012658089399338, + "step": 334 + }, + { + "clip_ratio/high_max": 8.26085442895419e-06, + "clip_ratio/high_mean": 2.0652136072385474e-06, + "clip_ratio/low_mean": 3.6938338666914206e-05, + "clip_ratio/low_min": 5.699044777429663e-06, + "clip_ratio/region_mean": 3.900355193309224e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16111.0, + "completions/mean_length": 8066.1015625, + "completions/mean_terminated_length": 7797.7822265625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 1.0789504647254944, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00243841833434999, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 292222082.0, + "reward": 0.3046875, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999664425849915, + "sampling/importance_sampling_ratio/min": 8.481895929435268e-05, + "sampling/sampling_logp_difference/max": 9.374991416931152, + "sampling/sampling_logp_difference/mean": 0.023650091141462326, + "step": 335 + }, + { + "clip_ratio/high_max": 5.320054697222076e-06, + "clip_ratio/high_mean": 1.330013674305519e-06, + "clip_ratio/low_mean": 1.9117383317279746e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0447396991585265e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15176.0, + "completions/mean_length": 6836.046875, + "completions/mean_terminated_length": 6606.896484375, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 1.218759760260582, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020856577903032303, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 293115984.0, + "reward": 0.21875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 
2.784526441246271e-05, + "sampling/sampling_logp_difference/max": 10.488847732543945, + "sampling/sampling_logp_difference/mean": 0.022012067958712578, + "step": 336 + }, + { + "clip_ratio/high_max": 2.5695502699818462e-05, + "clip_ratio/high_mean": 7.549717793153832e-06, + "clip_ratio/low_mean": 4.6741323160404136e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.429104089671455e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7501.9921875, + "completions/mean_terminated_length": 7140.9345703125, + "completions/min_length": 1237.0, + "completions/min_terminated_length": 1237.0, + "entropy": 0.8940394818782806, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005163854919373989, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 294099503.0, + "reward": 0.328125, + "reward_std": 0.30904707312583923, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999276399612427, + "sampling/importance_sampling_ratio/min": 0.0006545600481331348, + "sampling/sampling_logp_difference/max": 7.331547260284424, + "sampling/sampling_logp_difference/mean": 0.020813245326280594, + "step": 337 + }, + { + "clip_ratio/high_max": 3.1606674838258186e-05, + "clip_ratio/high_mean": 9.45794374729303e-06, + "clip_ratio/low_mean": 4.5567895540443715e-05, + "clip_ratio/low_min": 4.458871444512624e-06, + "clip_ratio/region_mean": 5.502583962879726e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7204.828125, + "completions/mean_terminated_length": 6908.7255859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.9961872175335884, + 
"epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029277894645929337, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 295042105.0, + "reward": 0.390625, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000677108764648, + "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05, + "sampling/sampling_logp_difference/max": 10.872637748718262, + "sampling/sampling_logp_difference/mean": 0.020187582820653915, + "step": 338 + }, + { + "clip_ratio/high_max": 1.7963964182854397e-05, + "clip_ratio/high_mean": 5.194059781388205e-06, + "clip_ratio/low_mean": 1.8380221035840805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.357428081722901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15856.0, + "completions/mean_length": 6256.859375, + "completions/mean_terminated_length": 6013.80810546875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "entropy": 0.9293600022792816, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032952844630926847, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 295867039.0, + "reward": 0.46875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999649524688721, + "sampling/importance_sampling_ratio/min": 7.995560008566827e-05, + "sampling/sampling_logp_difference/max": 9.434039115905762, + "sampling/sampling_logp_difference/mean": 0.019491540268063545, + "step": 339 + }, + { + "clip_ratio/high_max": 7.577551059512189e-06, + "clip_ratio/high_mean": 1.8943877648780472e-06, + 
"clip_ratio/low_mean": 2.7479814093567256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9374201631071628e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15412.0, + "completions/mean_length": 7397.84375, + "completions/mean_terminated_length": 7032.552734375, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.8508890569210052, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029417150653898716, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 296832843.0, + "reward": 0.375, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000183582305908, + "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05, + "sampling/sampling_logp_difference/max": 10.93724250793457, + "sampling/sampling_logp_difference/mean": 0.01975393109023571, + "step": 340 + }, + { + "clip_ratio/high_max": 3.281225508544594e-05, + "clip_ratio/high_mean": 1.3302957199812226e-05, + "clip_ratio/low_mean": 5.109179869577929e-05, + "clip_ratio/low_min": 6.657612175331451e-06, + "clip_ratio/region_mean": 6.439475532715733e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6897.765625, + "completions/mean_terminated_length": 6823.07080078125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9046694040298462, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026788609102368355, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 297735285.0, + "reward": 0.421875, + "reward_std": 0.3266732692718506, + "rewards/accuracy_reward/mean": 0.421875, + 
"rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.001710799871943891, + "sampling/sampling_logp_difference/max": 6.370794296264648, + "sampling/sampling_logp_difference/mean": 0.020578179508447647, + "step": 341 + }, + { + "clip_ratio/high_max": 1.7319889593636617e-05, + "clip_ratio/high_mean": 5.168538336874917e-06, + "clip_ratio/low_mean": 7.019768918326008e-05, + "clip_ratio/low_min": 2.541147478041239e-05, + "clip_ratio/region_mean": 7.53662266106403e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 6971.9921875, + "completions/mean_terminated_length": 6509.10595703125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8658201694488525, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005915141198784113, + "learning_rate": 1e-05, + "loss": 0.0923, + "num_tokens": 298645124.0, + "reward": 0.3984375, + "reward_std": 0.3742823898792267, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999268651008606, + "sampling/importance_sampling_ratio/min": 0.000970841443631798, + "sampling/sampling_logp_difference/max": 6.937347412109375, + "sampling/sampling_logp_difference/mean": 0.01906151883304119, + "step": 342 + }, + { + "clip_ratio/high_max": 1.8332865238335216e-05, + "clip_ratio/high_mean": 4.583216309583804e-06, + "clip_ratio/low_mean": 6.167940273371642e-05, + "clip_ratio/low_min": 5.969151516183047e-06, + "clip_ratio/region_mean": 6.626261847486603e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15054.0, + 
"completions/mean_length": 6545.6953125, + "completions/mean_terminated_length": 5889.80859375, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.779609851539135, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032792428974062204, + "learning_rate": 1e-05, + "loss": 0.097, + "num_tokens": 299503781.0, + "reward": 0.609375, + "reward_std": 0.38293448090553284, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999361634254456, + "sampling/importance_sampling_ratio/min": 0.002187495119869709, + "sampling/sampling_logp_difference/max": 6.124998092651367, + "sampling/sampling_logp_difference/mean": 0.017413027584552765, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.46246323235755e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.46246323235755e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7226.515625, + "completions/mean_terminated_length": 7006.736328125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9573849961161613, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005092279519885778, + "learning_rate": 1e-05, + "loss": 0.1102, + "num_tokens": 300447903.0, + "reward": 0.5390625, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999373555183411, + "sampling/importance_sampling_ratio/min": 0.000627054600045085, + "sampling/sampling_logp_difference/max": 7.374476909637451, + 
"sampling/sampling_logp_difference/mean": 0.021570835262537003, + "step": 344 + }, + { + "clip_ratio/high_max": 5.487269390869187e-06, + "clip_ratio/high_mean": 1.3718173477172968e-06, + "clip_ratio/low_mean": 4.7280102080549113e-05, + "clip_ratio/low_min": 1.0166083029616857e-05, + "clip_ratio/region_mean": 4.865191931457957e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14967.0, + "completions/mean_length": 5755.171875, + "completions/mean_terminated_length": 5323.10546875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8482184633612633, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005033228080719709, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 301206021.0, + "reward": 0.390625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.0014573346124961972, + "sampling/sampling_logp_difference/max": 6.531146049499512, + "sampling/sampling_logp_difference/mean": 0.018870476633310318, + "step": 345 + }, + { + "clip_ratio/high_max": 5.421346941147931e-06, + "clip_ratio/high_mean": 1.3553367352869827e-06, + "clip_ratio/low_mean": 1.6510994441887306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.786633117717429e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 7098.7265625, + "completions/mean_terminated_length": 6875.88037109375, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "entropy": 0.87320177257061, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007659573573619127, + 
"learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 302133890.0, + "reward": 0.421875, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0012466582702472806, + "sampling/sampling_logp_difference/max": 6.687288761138916, + "sampling/sampling_logp_difference/mean": 0.019994346424937248, + "step": 346 + }, + { + "clip_ratio/high_max": 1.1556229310372146e-05, + "clip_ratio/high_mean": 2.8890573275930365e-06, + "clip_ratio/low_mean": 3.8744643916288624e-05, + "clip_ratio/low_min": 6.108287834649673e-06, + "clip_ratio/region_mean": 4.1633702039689524e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16139.0, + "completions/mean_length": 6399.96875, + "completions/mean_terminated_length": 6077.90283203125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9481896534562111, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014135175151750445, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 302972566.0, + "reward": 0.4140625, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0025698256213217974, + "sampling/sampling_logp_difference/max": 5.963917255401611, + "sampling/sampling_logp_difference/mean": 0.02073008380830288, + "step": 347 + }, + { + "clip_ratio/high_max": 6.59491388432798e-06, + "clip_ratio/high_mean": 2.545892130001448e-06, + "clip_ratio/low_mean": 4.620846755187813e-05, + "clip_ratio/low_min": 
6.243132702365983e-06, + "clip_ratio/region_mean": 4.875435956819274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 7298.078125, + "completions/mean_terminated_length": 7226.53564453125, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "entropy": 0.8719206526875496, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027898226398974657, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 303925976.0, + "reward": 0.484375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.005236432887613773, + "sampling/sampling_logp_difference/max": 5.252114772796631, + "sampling/sampling_logp_difference/mean": 0.020944103598594666, + "step": 348 + }, + { + "clip_ratio/high_max": 1.052124343914329e-05, + "clip_ratio/high_mean": 2.6303108597858227e-06, + "clip_ratio/low_mean": 2.010384196182713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.273415248055244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14980.0, + "completions/mean_length": 5667.0390625, + "completions/mean_terminated_length": 5496.9287109375, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "entropy": 0.8791451379656792, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012764945859089494, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 304675157.0, + "reward": 0.390625, + "reward_std": 0.17965976893901825, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000383853912354, + "sampling/importance_sampling_ratio/min": 5.054428584116977e-06, + "sampling/sampling_logp_difference/max": 12.195245742797852, + "sampling/sampling_logp_difference/mean": 0.018928447738289833, + "step": 349 + }, + { + "clip_ratio/high_max": 9.578045592206763e-06, + "clip_ratio/high_mean": 2.3945113980516908e-06, + "clip_ratio/low_mean": 3.1114799753595435e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350931149270764e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15354.0, + "completions/max_terminated_length": 15354.0, + "completions/mean_length": 5874.4453125, + "completions/mean_terminated_length": 5874.4453125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9577538818120956, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00509974779561162, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 305447038.0, + "reward": 0.515625, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999423027038574, + "sampling/importance_sampling_ratio/min": 0.004791648127138615, + "sampling/sampling_logp_difference/max": 5.340880870819092, + "sampling/sampling_logp_difference/mean": 0.02114470861852169, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0903062275247066e-05, + "clip_ratio/high_mean": 2.7257655688117666e-06, + "clip_ratio/low_mean": 4.784364205079328e-05, + "clip_ratio/low_min": 3.861600362142781e-06, + "clip_ratio/region_mean": 5.056940744907479e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 6197.5703125, + 
"completions/mean_terminated_length": 6035.88134765625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.8665244281291962, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030849494505673647, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 306258023.0, + "reward": 0.515625, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998056888580322, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.021017421036958694, + "step": 351 + }, + { + "clip_ratio/high_max": 1.4299712574938894e-05, + "clip_ratio/high_mean": 4.3520980170796975e-06, + "clip_ratio/low_mean": 6.213493452378316e-05, + "clip_ratio/low_min": 1.0056635801447555e-05, + "clip_ratio/region_mean": 6.648703174505499e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7522.578125, + "completions/mean_terminated_length": 7381.9208984375, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.8185881152749062, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002946985885500908, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 307240305.0, + "reward": 0.3125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.005127199459820986, + "sampling/sampling_logp_difference/max": 5.273195743560791, + 
"sampling/sampling_logp_difference/mean": 0.01965932548046112, + "step": 352 + }, + { + "clip_ratio/high_max": 1.693051035545068e-05, + "clip_ratio/high_mean": 5.08456730585749e-06, + "clip_ratio/low_mean": 4.2052345861520735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.713691282631771e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14090.0, + "completions/mean_length": 6403.2265625, + "completions/mean_terminated_length": 6163.6884765625, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "entropy": 0.8359840363264084, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031181599479168653, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 308079318.0, + "reward": 0.5, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999215602874756, + "sampling/importance_sampling_ratio/min": 6.73715621815063e-05, + "sampling/sampling_logp_difference/max": 9.605287551879883, + "sampling/sampling_logp_difference/mean": 0.01963040418922901, + "step": 353 + }, + { + "clip_ratio/high_max": 1.3988919135954347e-05, + "clip_ratio/high_mean": 3.497229783988587e-06, + "clip_ratio/low_mean": 6.722658486069122e-05, + "clip_ratio/low_min": 1.858519090092159e-05, + "clip_ratio/region_mean": 7.072381458783639e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7954.03125, + "completions/mean_terminated_length": 7751.71240234375, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.905990719795227, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002656223252415657, + 
"learning_rate": 1e-05, + "loss": 0.1022, + "num_tokens": 309117770.0, + "reward": 0.3828125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999536275863647, + "sampling/importance_sampling_ratio/min": 0.0003354826185386628, + "sampling/sampling_logp_difference/max": 7.999940395355225, + "sampling/sampling_logp_difference/mean": 0.020741507411003113, + "step": 354 + }, + { + "clip_ratio/high_max": 1.7610595023143105e-05, + "clip_ratio/high_mean": 4.402648755785776e-06, + "clip_ratio/low_mean": 4.337988764291367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.778253651238629e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6630.09375, + "completions/mean_terminated_length": 6315.45166015625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.870736837387085, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0060529084876179695, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 309988894.0, + "reward": 0.515625, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998822212219238, + "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05, + "sampling/sampling_logp_difference/max": 10.716434478759766, + "sampling/sampling_logp_difference/mean": 0.02060208097100258, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0448093235027045e-05, + "clip_ratio/high_mean": 2.6120233087567613e-06, + "clip_ratio/low_mean": 3.1030769946482906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.364279325523967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15920.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6679.6171875, + "completions/mean_terminated_length": 6679.6171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9812518879771233, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00400698184967041, + "learning_rate": 1e-05, + "loss": 0.0605, + "num_tokens": 310864013.0, + "reward": 0.421875, + "reward_std": 0.3295465111732483, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999049305915833, + "sampling/importance_sampling_ratio/min": 0.0020593837834894657, + "sampling/sampling_logp_difference/max": 6.1853485107421875, + "sampling/sampling_logp_difference/mean": 0.02098071575164795, + "step": 356 + }, + { + "clip_ratio/high_max": 2.124982574969181e-05, + "clip_ratio/high_mean": 7.736592579021817e-06, + "clip_ratio/low_mean": 2.900951585615985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.674610888992902e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14541.0, + "completions/mean_length": 5523.796875, + "completions/mean_terminated_length": 5173.4677734375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9120645374059677, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005929585546255112, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 311589987.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998446702957153, + "sampling/importance_sampling_ratio/min": 0.0010661041596904397, + "sampling/sampling_logp_difference/max": 6.843744277954102, + "sampling/sampling_logp_difference/mean": 0.019948206841945648, + "step": 357 + }, + { + "clip_ratio/high_max": 2.4486997745043482e-05, + "clip_ratio/high_mean": 8.219769085826556e-06, + "clip_ratio/low_mean": 5.346400575945154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.168377467474784e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15401.0, + "completions/mean_length": 6361.3671875, + "completions/mean_terminated_length": 6282.44873046875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.8044678047299385, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006622390355914831, + "learning_rate": 1e-05, + "loss": 0.1023, + "num_tokens": 312424034.0, + "reward": 0.5078125, + "reward_std": 0.3724474310874939, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.0003157092141918838, + "sampling/sampling_logp_difference/max": 8.060688972473145, + "sampling/sampling_logp_difference/mean": 0.018907658755779266, + "step": 358 + }, + { + "clip_ratio/high_max": 1.0407376748844399e-05, + "clip_ratio/high_mean": 2.6018441872110998e-06, + "clip_ratio/low_mean": 5.925514369664597e-05, + "clip_ratio/low_min": 1.3324347946763737e-05, + "clip_ratio/region_mean": 6.185698703120579e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 7109.0, + "completions/mean_terminated_length": 7035.96826171875, + 
"completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9167275875806808, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004639944992959499, + "learning_rate": 1e-05, + "loss": 0.0861, + "num_tokens": 313353346.0, + "reward": 0.4140625, + "reward_std": 0.3826971650123596, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999389052391052, + "sampling/importance_sampling_ratio/min": 0.0019070414127781987, + "sampling/sampling_logp_difference/max": 6.262202262878418, + "sampling/sampling_logp_difference/mean": 0.02155841514468193, + "step": 359 + }, + { + "clip_ratio/high_max": 3.959046694035351e-05, + "clip_ratio/high_mean": 1.0912523691786191e-05, + "clip_ratio/low_mean": 3.3944450819944905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.485697365907981e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6314.2734375, + "completions/mean_terminated_length": 6072.60009765625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.8780038207769394, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007643720600754023, + "learning_rate": 1e-05, + "loss": 0.0873, + "num_tokens": 314180717.0, + "reward": 0.4609375, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999802112579346, + "sampling/importance_sampling_ratio/min": 0.021285315975546837, + "sampling/sampling_logp_difference/max": 3.8497378826141357, + "sampling/sampling_logp_difference/mean": 0.01964358240365982, + "step": 360 
+ }, + { + "clip_ratio/high_max": 3.065382111344661e-05, + "clip_ratio/high_mean": 9.187473835936544e-06, + "clip_ratio/low_mean": 4.137891801292426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.056639065514901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6718.2265625, + "completions/mean_terminated_length": 6486.24853515625, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8326799497008324, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050973957404494286, + "learning_rate": 1e-05, + "loss": 0.0109, + "num_tokens": 315060842.0, + "reward": 0.5078125, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.0009130688849836588, + "sampling/sampling_logp_difference/max": 6.998699188232422, + "sampling/sampling_logp_difference/mean": 0.019501537084579468, + "step": 361 + }, + { + "clip_ratio/high_max": 8.624853762739804e-06, + "clip_ratio/high_mean": 2.156213440684951e-06, + "clip_ratio/low_mean": 1.8797969062234188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0954182048171788e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 8666.8359375, + "completions/mean_terminated_length": 7941.291015625, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.9526705741882324, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019092690199613571, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 316190325.0, + "reward": 
0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999814629554749, + "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05, + "sampling/sampling_logp_difference/max": 10.249995231628418, + "sampling/sampling_logp_difference/mean": 0.02051631174981594, + "step": 362 + }, + { + "clip_ratio/high_max": 2.147400391550036e-05, + "clip_ratio/high_mean": 6.434908300434472e-06, + "clip_ratio/low_mean": 3.521234066283796e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.164724816746457e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15164.0, + "completions/mean_length": 7661.8203125, + "completions/mean_terminated_length": 7002.16015625, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.8322782590985298, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019530428107827902, + "learning_rate": 1e-05, + "loss": 0.0729, + "num_tokens": 317191878.0, + "reward": 0.4609375, + "reward_std": 0.21382391452789307, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 8.546619210392237e-05, + "sampling/sampling_logp_difference/max": 9.367389678955078, + "sampling/sampling_logp_difference/mean": 0.019894573837518692, + "step": 363 + }, + { + "clip_ratio/high_max": 1.9436202364886412e-05, + "clip_ratio/high_mean": 6.089704697842535e-06, + "clip_ratio/low_mean": 4.2698405422925134e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.878810955233348e-05, + "completions/clipped_ratio": 0.0234375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 7024.859375, + "completions/mean_terminated_length": 6800.240234375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.794853538274765, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031784537713974714, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 318109004.0, + "reward": 0.4921875, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352693557739, + "sampling/importance_sampling_ratio/min": 0.0002962362195830792, + "sampling/sampling_logp_difference/max": 8.124353408813477, + "sampling/sampling_logp_difference/mean": 0.018519200384616852, + "step": 364 + }, + { + "clip_ratio/high_max": 4.127455667912727e-06, + "clip_ratio/high_mean": 1.0318639169781818e-06, + "clip_ratio/low_mean": 4.342453667049995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.445640047379129e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 7282.1796875, + "completions/mean_terminated_length": 6912.1865234375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.904067650437355, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005080109462141991, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 319059075.0, + "reward": 0.4140625, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062108039856, + 
"sampling/importance_sampling_ratio/min": 0.1194523349404335, + "sampling/sampling_logp_difference/max": 6.136754989624023, + "sampling/sampling_logp_difference/mean": 0.019978653639554977, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.608940076243016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.608940076243016e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15625.0, + "completions/mean_length": 7131.5234375, + "completions/mean_terminated_length": 6596.255859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.8849587142467499, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022667953744530678, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 319990046.0, + "reward": 0.46875, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0370909757912159, + "sampling/sampling_logp_difference/max": 3.294381618499756, + "sampling/sampling_logp_difference/mean": 0.02037571743130684, + "step": 366 + }, + { + "clip_ratio/high_max": 1.5356635913121863e-05, + "clip_ratio/high_mean": 3.839158978280466e-06, + "clip_ratio/low_mean": 3.4950805911648786e-05, + "clip_ratio/low_min": 4.876336333836662e-06, + "clip_ratio/region_mean": 3.8789965287833184e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 6655.4453125, + "completions/mean_terminated_length": 6578.84228515625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.7417122721672058, + "epoch": 
0.3376264949402024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00216497085057199, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 320860135.0, + "reward": 0.5625, + "reward_std": 0.3369230031967163, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0005190494703128934, + "sampling/sampling_logp_difference/max": 7.563511371612549, + "sampling/sampling_logp_difference/mean": 0.01771342009305954, + "step": 367 + }, + { + "clip_ratio/high_max": 1.7605634639039636e-05, + "clip_ratio/high_mean": 5.297029474604642e-06, + "clip_ratio/low_mean": 5.688933060810086e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.218636053745286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15849.0, + "completions/mean_length": 7077.1640625, + "completions/mean_terminated_length": 6619.45068359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.8749325424432755, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0028338562697172165, + "learning_rate": 1e-05, + "loss": 0.0643, + "num_tokens": 321783852.0, + "reward": 0.3828125, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998220205307007, + "sampling/importance_sampling_ratio/min": 7.83290306571871e-06, + "sampling/sampling_logp_difference/max": 11.757177352905273, + "sampling/sampling_logp_difference/mean": 0.020299233496189117, + "step": 368 + }, + { + "clip_ratio/high_max": 7.301828190975357e-06, + "clip_ratio/high_mean": 1.8254570477438392e-06, + "clip_ratio/low_mean": 
5.158197632226802e-05, + "clip_ratio/low_min": 3.735804057214409e-06, + "clip_ratio/region_mean": 5.340743223314348e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6034.296875, + "completions/mean_terminated_length": 5525.294921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.80014718323946, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022897711023688316, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 322572882.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999347925186157, + "sampling/importance_sampling_ratio/min": 0.0004105660773348063, + "sampling/sampling_logp_difference/max": 7.7979736328125, + "sampling/sampling_logp_difference/mean": 0.01858348958194256, + "step": 369 + }, + { + "clip_ratio/high_max": 9.364057859784225e-06, + "clip_ratio/high_mean": 3.351393047523743e-06, + "clip_ratio/low_mean": 4.186752630630508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5218919240141986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 8172.109375, + "completions/mean_terminated_length": 7838.29248046875, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "entropy": 0.8732693120837212, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003263789461925626, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 323640904.0, + "reward": 0.2890625, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999354481697083, + "sampling/importance_sampling_ratio/min": 9.27252222027164e-06, + "sampling/sampling_logp_difference/max": 11.588455200195312, + "sampling/sampling_logp_difference/mean": 0.0208889190107584, + "step": 370 + }, + { + "clip_ratio/high_max": 2.0998899799451465e-05, + "clip_ratio/high_mean": 6.692962131182867e-06, + "clip_ratio/low_mean": 4.261424010110204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930720297124935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 7699.203125, + "completions/mean_terminated_length": 7419.04833984375, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.8296505436301231, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0042716520838439465, + "learning_rate": 1e-05, + "loss": 0.0937, + "num_tokens": 324643858.0, + "reward": 0.4921875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874234199524, + "sampling/importance_sampling_ratio/min": 0.00022192654432728887, + "sampling/sampling_logp_difference/max": 8.413164138793945, + "sampling/sampling_logp_difference/mean": 0.018926654011011124, + "step": 371 + }, + { + "clip_ratio/high_max": 7.061349151626928e-06, + "clip_ratio/high_mean": 1.765337287906732e-06, + "clip_ratio/low_mean": 4.5005243464402156e-05, + "clip_ratio/low_min": 3.861838649754645e-06, + "clip_ratio/region_mean": 4.6770580411248375e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 7450.1640625, + 
"completions/mean_terminated_length": 7450.1640625, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 1.0400195196270943, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033558050636202097, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 325617687.0, + "reward": 0.2578125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999459385871887, + "sampling/importance_sampling_ratio/min": 0.039920732378959656, + "sampling/sampling_logp_difference/max": 3.2208595275878906, + "sampling/sampling_logp_difference/mean": 0.02249298244714737, + "step": 372 + }, + { + "clip_ratio/high_max": 1.3147802746971138e-05, + "clip_ratio/high_mean": 3.2869506867427845e-06, + "clip_ratio/low_mean": 2.4451034505545977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7737984851228248e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15342.0, + "completions/mean_length": 6799.0703125, + "completions/mean_terminated_length": 6723.5986328125, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9737623482942581, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005797459278255701, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 326508384.0, + "reward": 0.3125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321699142456, + "sampling/importance_sampling_ratio/min": 7.535634836131067e-07, + "sampling/sampling_logp_difference/max": 14.0984525680542, + 
"sampling/sampling_logp_difference/mean": 0.021543748676776886, + "step": 373 + }, + { + "clip_ratio/high_max": 3.3594023989280686e-06, + "clip_ratio/high_mean": 8.398505997320171e-07, + "clip_ratio/low_mean": 2.3457610382138228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4297460981870245e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 7034.3671875, + "completions/mean_terminated_length": 6654.30078125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8749603256583214, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002258980879560113, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 327426407.0, + "reward": 0.4609375, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.008719252422451973, + "sampling/sampling_logp_difference/max": 4.742221832275391, + "sampling/sampling_logp_difference/mean": 0.01997346058487892, + "step": 374 + }, + { + "clip_ratio/high_max": 2.823375348270929e-05, + "clip_ratio/high_mean": 7.058438370677322e-06, + "clip_ratio/low_mean": 4.9395109726901865e-05, + "clip_ratio/low_min": 1.636556044104509e-05, + "clip_ratio/region_mean": 5.6453548268109444e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15240.0, + "completions/mean_length": 6623.078125, + "completions/mean_terminated_length": 6388.81640625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.858784057199955, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.002420129720121622, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 328292985.0, + "reward": 0.4140625, + "reward_std": 0.3077537417411804, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 0.00014900295354891568, + "sampling/sampling_logp_difference/max": 8.811544418334961, + "sampling/sampling_logp_difference/mean": 0.019645996391773224, + "step": 375 + }, + { + "clip_ratio/high_max": 1.8078507309837732e-05, + "clip_ratio/high_mean": 6.468551191574079e-06, + "clip_ratio/low_mean": 4.051302585139638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.698157727034413e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15229.0, + "completions/mean_length": 5902.4765625, + "completions/mean_terminated_length": 5564.36279296875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.904740035533905, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004107976797968149, + "learning_rate": 1e-05, + "loss": 0.0824, + "num_tokens": 329067006.0, + "reward": 0.5546875, + "reward_std": 0.3945493996143341, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05, + "sampling/sampling_logp_difference/max": 11.37439250946045, + "sampling/sampling_logp_difference/mean": 0.019582755863666534, + "step": 376 + }, + { + "clip_ratio/high_max": 2.553658168835682e-05, + "clip_ratio/high_mean": 7.276365181496658e-06, + "clip_ratio/low_mean": 1.7552573126522475e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.482893796695862e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6425.6015625, + "completions/mean_terminated_length": 6267.5322265625, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.964553713798523, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003208522219210863, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 329910691.0, + "reward": 0.359375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999419450759888, + "sampling/importance_sampling_ratio/min": 0.00137569778598845, + "sampling/sampling_logp_difference/max": 6.588794231414795, + "sampling/sampling_logp_difference/mean": 0.021154657006263733, + "step": 377 + }, + { + "clip_ratio/high_max": 6.8712420215888415e-06, + "clip_ratio/high_mean": 1.7178105053972104e-06, + "clip_ratio/low_mean": 4.0991827404468495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2709637853022286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 8006.4453125, + "completions/mean_terminated_length": 7594.43408203125, + "completions/min_length": 1235.0, + "completions/min_terminated_length": 1235.0, + "entropy": 0.8980336412787437, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002898421371355653, + "learning_rate": 1e-05, + "loss": 0.0815, + "num_tokens": 330956332.0, + "reward": 0.4296875, + "reward_std": 0.20175684988498688, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 9.378339746035635e-05, + "sampling/sampling_logp_difference/max": 9.27452278137207, + "sampling/sampling_logp_difference/mean": 0.021021340042352676, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2689344689297286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2689344689297286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15484.0, + "completions/max_terminated_length": 15484.0, + "completions/mean_length": 7068.828125, + "completions/mean_terminated_length": 7068.828125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.9865007549524307, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0037063576746731997, + "learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 331880918.0, + "reward": 0.3203125, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0001819290773710236, + "sampling/sampling_logp_difference/max": 8.611893653869629, + "sampling/sampling_logp_difference/mean": 0.02072504535317421, + "step": 379 + }, + { + "clip_ratio/high_max": 5.845633268108941e-06, + "clip_ratio/high_mean": 1.4614083170272352e-06, + "clip_ratio/low_mean": 3.207486906831036e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353627721480734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 7379.390625, + "completions/mean_terminated_length": 7236.4609375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + 
"entropy": 0.8977236375212669, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001972826896235347, + "learning_rate": 1e-05, + "loss": 0.0228, + "num_tokens": 332849112.0, + "reward": 0.4140625, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 2.820451663865242e-05, + "sampling/sampling_logp_difference/max": 10.476028442382812, + "sampling/sampling_logp_difference/mean": 0.019411223009228706, + "step": 380 + }, + { + "clip_ratio/high_max": 4.875385002378607e-06, + "clip_ratio/high_mean": 1.2188462505946518e-06, + "clip_ratio/low_mean": 2.3530714997832547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.47495612484272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15517.0, + "completions/mean_length": 6867.9609375, + "completions/mean_terminated_length": 6793.03125, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "entropy": 0.9244343340396881, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.006926023401319981, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333746179.0, + "reward": 0.4140625, + "reward_std": 0.1433562934398651, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.0003875594411510974, + "sampling/sampling_logp_difference/max": 7.8556413650512695, + "sampling/sampling_logp_difference/mean": 0.020311862230300903, + "step": 381 + }, + { + "clip_ratio/high_max": 1.5651628245905158e-05, + 
"clip_ratio/high_mean": 4.836261211949022e-06, + "clip_ratio/low_mean": 5.268017821435933e-05, + "clip_ratio/low_min": 3.950945028918795e-06, + "clip_ratio/region_mean": 5.751643902840442e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 7525.375, + "completions/mean_terminated_length": 6855.3955078125, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9207312315702438, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0047226278111338615, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 334731027.0, + "reward": 0.3359375, + "reward_std": 0.3353874683380127, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999615550041199, + "sampling/importance_sampling_ratio/min": 0.00029753465787507594, + "sampling/sampling_logp_difference/max": 8.119979858398438, + "sampling/sampling_logp_difference/mean": 0.021496692672371864, + "step": 382 + }, + { + "clip_ratio/high_max": 3.815379886873416e-05, + "clip_ratio/high_mean": 9.53844971718354e-06, + "clip_ratio/low_mean": 4.519663821156428e-05, + "clip_ratio/low_min": 2.775434040813707e-06, + "clip_ratio/region_mean": 5.473508826980833e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16251.0, + "completions/mean_length": 6841.0625, + "completions/mean_terminated_length": 6453.13818359375, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.8979457840323448, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004971448332071304, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 335631243.0, + "reward": 0.390625, + "reward_std": 
0.2596156895160675, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999934196472168, + "sampling/importance_sampling_ratio/min": 9.655764188210014e-06, + "sampling/sampling_logp_difference/max": 11.547955513000488, + "sampling/sampling_logp_difference/mean": 0.020256079733371735, + "step": 383 + }, + { + "clip_ratio/high_max": 4.162365712545579e-06, + "clip_ratio/high_mean": 1.0405914281363948e-06, + "clip_ratio/low_mean": 3.1563491688757495e-05, + "clip_ratio/low_min": 3.1228139505401487e-06, + "clip_ratio/region_mean": 3.260408311689389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15060.0, + "completions/mean_length": 6919.8046875, + "completions/mean_terminated_length": 6454.35205078125, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9241961911320686, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038604787550866604, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 336537162.0, + "reward": 0.375, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998080730438232, + "sampling/importance_sampling_ratio/min": 0.0009118975722230971, + "sampling/sampling_logp_difference/max": 6.999982833862305, + "sampling/sampling_logp_difference/mean": 0.02030865103006363, + "step": 384 + }, + { + "clip_ratio/high_max": 6.5182248363271356e-06, + "clip_ratio/high_mean": 1.6295562090817839e-06, + "clip_ratio/low_mean": 4.3847362121596234e-05, + "clip_ratio/low_min": 6.294533704931382e-06, + "clip_ratio/region_mean": 4.547691833067802e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15692.0, + "completions/mean_length": 7679.390625, + "completions/mean_terminated_length": 7099.08349609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 1.0165777206420898, + "epoch": 0.35418583256669733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004624314606189728, + "learning_rate": 1e-05, + "loss": 0.0849, + "num_tokens": 337542492.0, + "reward": 0.3046875, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999251961708069, + "sampling/importance_sampling_ratio/min": 5.83546279813163e-05, + "sampling/sampling_logp_difference/max": 9.748971939086914, + "sampling/sampling_logp_difference/mean": 0.02206476218998432, + "step": 385 + }, + { + "clip_ratio/high_max": 6.00499606662197e-06, + "clip_ratio/high_mean": 1.5012490166554926e-06, + "clip_ratio/low_mean": 3.392923713363416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.543048615028965e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 5957.5859375, + "completions/mean_terminated_length": 5792.08740234375, + "completions/min_length": 1705.0, + "completions/min_terminated_length": 1705.0, + "entropy": 0.7705951780080795, + "epoch": 0.35510579576816925, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021966886706650257, + "learning_rate": 1e-05, + "loss": 0.0789, + "num_tokens": 338324279.0, + "reward": 0.53125, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999998927116394, + 
"sampling/importance_sampling_ratio/min": 0.0008041196851991117, + "sampling/sampling_logp_difference/max": 7.125762462615967, + "sampling/sampling_logp_difference/mean": 0.01804077997803688, + "step": 386 + }, + { + "clip_ratio/high_max": 1.5711350215497077e-05, + "clip_ratio/high_mean": 3.927837553874269e-06, + "clip_ratio/low_mean": 5.276240381135722e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.669024130838807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7269.8046875, + "completions/mean_terminated_length": 7198.03955078125, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 1.0025205165147781, + "epoch": 0.3560257589696412, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001694107661023736, + "learning_rate": 1e-05, + "loss": 0.134, + "num_tokens": 339274662.0, + "reward": 0.3359375, + "reward_std": 0.30487072467803955, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039769172668, + "sampling/importance_sampling_ratio/min": 0.0015677008777856827, + "sampling/sampling_logp_difference/max": 6.4581451416015625, + "sampling/sampling_logp_difference/mean": 0.021742526441812515, + "step": 387 + }, + { + "clip_ratio/high_max": 7.005848829066963e-06, + "clip_ratio/high_mean": 1.7514622072667407e-06, + "clip_ratio/low_mean": 5.100632029098051e-05, + "clip_ratio/low_min": 8.934973720897688e-06, + "clip_ratio/region_mean": 5.275778244140383e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7643.8359375, + "completions/mean_terminated_length": 7288.54443359375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 
1061.0, + "entropy": 0.7936615869402885, + "epoch": 0.35694572217111314, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004587972536683083, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 340272689.0, + "reward": 0.5078125, + "reward_std": 0.35324612259864807, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999613761901855, + "sampling/importance_sampling_ratio/min": 0.0007390327518805861, + "sampling/sampling_logp_difference/max": 7.210168361663818, + "sampling/sampling_logp_difference/mean": 0.01862112432718277, + "step": 388 + }, + { + "clip_ratio/high_max": 1.0522736374696251e-05, + "clip_ratio/high_mean": 2.6306840936740628e-06, + "clip_ratio/low_mean": 2.139122614153166e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4021910121518886e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14401.0, + "completions/mean_length": 7068.734375, + "completions/mean_terminated_length": 6610.60595703125, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "entropy": 0.8858344480395317, + "epoch": 0.3578656853725851, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00245783943682909, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 341195599.0, + "reward": 0.4609375, + "reward_std": 0.21594557166099548, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957263469696, + "sampling/importance_sampling_ratio/min": 1.526316918898374e-05, + "sampling/sampling_logp_difference/max": 11.090067863464355, + "sampling/sampling_logp_difference/mean": 0.019989900290966034, + "step": 389 + }, + { + "clip_ratio/high_max": 5.272259386401856e-06, + 
"clip_ratio/high_mean": 1.318064846600464e-06, + "clip_ratio/low_mean": 2.2939096254503966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4257160987417592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15788.0, + "completions/mean_length": 6093.296875, + "completions/mean_terminated_length": 5929.95263671875, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.9640207663178444, + "epoch": 0.35878564857405704, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0067657483741641045, + "learning_rate": 1e-05, + "loss": 0.0181, + "num_tokens": 341993565.0, + "reward": 0.4453125, + "reward_std": 0.12415502220392227, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998992681503296, + "sampling/importance_sampling_ratio/min": 0.010459281504154205, + "sampling/sampling_logp_difference/max": 4.56026554107666, + "sampling/sampling_logp_difference/mean": 0.02037961222231388, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.566248594528588e-05, + "clip_ratio/low_min": 4.402028480399167e-06, + "clip_ratio/region_mean": 4.566248594528588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16170.0, + "completions/max_terminated_length": 16170.0, + "completions/mean_length": 7620.09375, + "completions/mean_terminated_length": 7620.09375, + "completions/min_length": 1076.0, + "completions/min_terminated_length": 1076.0, + "entropy": 0.9773544892668724, + "epoch": 0.35970561177552896, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018817185191437602, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 342990545.0, + "reward": 0.3046875, + "reward_std": 0.18755048513412476, + "rewards/accuracy_reward/mean": 0.3046875, 
+ "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0006883936002850533, + "sampling/sampling_logp_difference/max": 7.281149864196777, + "sampling/sampling_logp_difference/mean": 0.021528441458940506, + "step": 391 + }, + { + "clip_ratio/high_max": 2.6727505428425502e-05, + "clip_ratio/high_mean": 7.985045499481203e-06, + "clip_ratio/low_mean": 7.762144696243922e-05, + "clip_ratio/low_min": 2.4772080450929934e-05, + "clip_ratio/region_mean": 8.560649303035461e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15053.0, + "completions/mean_length": 6963.984375, + "completions/mean_terminated_length": 6737.904296875, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.9683744385838509, + "epoch": 0.36062557497700093, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052104732021689415, + "learning_rate": 1e-05, + "loss": 0.087, + "num_tokens": 343898791.0, + "reward": 0.4296875, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324679374695, + "sampling/importance_sampling_ratio/min": 0.010815954767167568, + "sampling/sampling_logp_difference/max": 4.526732921600342, + "sampling/sampling_logp_difference/mean": 0.021434593945741653, + "step": 392 + }, + { + "clip_ratio/high_max": 1.3545108686230378e-05, + "clip_ratio/high_mean": 4.365133804640209e-06, + "clip_ratio/low_mean": 2.5377692509209737e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9742826200163108e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15116.0, + 
"completions/mean_length": 6718.5078125, + "completions/mean_terminated_length": 6642.4013671875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9043834507465363, + "epoch": 0.36154553817847285, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005151392426341772, + "learning_rate": 1e-05, + "loss": 0.0085, + "num_tokens": 344779672.0, + "reward": 0.4921875, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999840497970581, + "sampling/importance_sampling_ratio/min": 0.0024171893019229174, + "sampling/sampling_logp_difference/max": 6.025149822235107, + "sampling/sampling_logp_difference/mean": 0.0201373603194952, + "step": 393 + }, + { + "clip_ratio/high_max": 1.2263486723895767e-05, + "clip_ratio/high_mean": 3.927679188109323e-06, + "clip_ratio/low_mean": 2.739263118201052e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132031042696326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16342.0, + "completions/mean_length": 7044.640625, + "completions/mean_terminated_length": 6820.49609375, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "entropy": 0.9017335474491119, + "epoch": 0.3624655013799448, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026606651954352856, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 345701722.0, + "reward": 0.3125, + "reward_std": 0.24146249890327454, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05, + "sampling/sampling_logp_difference/max": 
10.157968521118164, + "sampling/sampling_logp_difference/mean": 0.01981864869594574, + "step": 394 + }, + { + "clip_ratio/high_max": 1.026556356009678e-05, + "clip_ratio/high_mean": 2.566390890024195e-06, + "clip_ratio/low_mean": 4.819571529424138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0762106297952414e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15476.0, + "completions/mean_length": 6031.875, + "completions/mean_terminated_length": 5950.3623046875, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.8537683561444283, + "epoch": 0.36338546458141674, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003957017324864864, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 346492810.0, + "reward": 0.4296875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999707341194153, + "sampling/importance_sampling_ratio/min": 0.0015133036067709327, + "sampling/sampling_logp_difference/max": 6.493460178375244, + "sampling/sampling_logp_difference/mean": 0.018711457028985023, + "step": 395 + }, + { + "clip_ratio/high_max": 5.870488848813693e-06, + "clip_ratio/high_mean": 1.4676222122034233e-06, + "clip_ratio/low_mean": 3.637038832948747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.783801014378696e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 7429.3515625, + "completions/mean_terminated_length": 6911.31396484375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 1194.0, + "entropy": 0.8821266070008278, + "epoch": 0.36430542778288866, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.002122648525983095, + "learning_rate": 1e-05, + "loss": 0.1257, + "num_tokens": 347462871.0, + "reward": 0.453125, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000076293945312, + "sampling/importance_sampling_ratio/min": 0.00014005196862854064, + "sampling/sampling_logp_difference/max": 8.873497009277344, + "sampling/sampling_logp_difference/mean": 0.01998838409781456, + "step": 396 + }, + { + "clip_ratio/high_max": 1.0663932243915042e-05, + "clip_ratio/high_mean": 2.6659830609787605e-06, + "clip_ratio/low_mean": 6.443337406381033e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.709935701110226e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15761.0, + "completions/mean_length": 7131.7109375, + "completions/mean_terminated_length": 6833.25, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.8575824722647667, + "epoch": 0.36522539098436063, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002546454081311822, + "learning_rate": 1e-05, + "loss": 0.0676, + "num_tokens": 348395842.0, + "reward": 0.4921875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999964714050293, + "sampling/importance_sampling_ratio/min": 0.0002167800412280485, + "sampling/sampling_logp_difference/max": 8.436627388000488, + "sampling/sampling_logp_difference/mean": 0.0193922221660614, + "step": 397 + }, + { + "clip_ratio/high_max": 3.847337666229578e-06, + "clip_ratio/high_mean": 9.618344165573944e-07, + "clip_ratio/low_mean": 3.932982110654848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.029165563679271e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16200.0, + "completions/mean_length": 6858.34375, + "completions/mean_terminated_length": 6707.14306640625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.9539813920855522, + "epoch": 0.36614535418583255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00492837093770504, + "learning_rate": 1e-05, + "loss": 0.0818, + "num_tokens": 349292790.0, + "reward": 0.390625, + "reward_std": 0.1949220597743988, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998850226402283, + "sampling/importance_sampling_ratio/min": 0.0011153683299198747, + "sampling/sampling_logp_difference/max": 6.79857063293457, + "sampling/sampling_logp_difference/mean": 0.020318543538451195, + "step": 398 + }, + { + "clip_ratio/high_max": 1.291372609557584e-05, + "clip_ratio/high_mean": 3.22843152389396e-06, + "clip_ratio/low_mean": 3.8245348378040944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1473780811429606e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15261.0, + "completions/mean_length": 7809.984375, + "completions/mean_terminated_length": 7533.40283203125, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.8353303670883179, + "epoch": 0.3670653173873045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004895905964076519, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 350312556.0, + "reward": 0.3203125, + "reward_std": 0.22567616403102875, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999260306358337, + "sampling/importance_sampling_ratio/min": 0.0008417933131568134, + "sampling/sampling_logp_difference/max": 7.0799760818481445, + "sampling/sampling_logp_difference/mean": 0.018754083663225174, + "step": 399 + }, + { + "clip_ratio/high_max": 1.1250081115576904e-05, + "clip_ratio/high_mean": 3.5690324011738994e-06, + "clip_ratio/low_mean": 3.196108968950284e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.553012152224255e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15057.0, + "completions/mean_length": 7194.9296875, + "completions/mean_terminated_length": 6821.39013671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9744522422552109, + "epoch": 0.36798528058877644, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0032397822942584753, + "learning_rate": 1e-05, + "loss": 0.0402, + "num_tokens": 351252755.0, + "reward": 0.421875, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998766183853149, + "sampling/importance_sampling_ratio/min": 0.00023159870761446655, + "sampling/sampling_logp_difference/max": 8.370504379272461, + "sampling/sampling_logp_difference/mean": 0.02105094864964485, + "step": 400 + }, + { + "clip_ratio/high_max": 6.980455509619787e-06, + "clip_ratio/high_mean": 1.7451138774049468e-06, + "clip_ratio/low_mean": 2.2670621888210007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.441573599298863e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 6836.234375, + "completions/mean_terminated_length": 6607.08837890625, + "completions/min_length": 
379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.9149863049387932, + "epoch": 0.3689052437902484, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031576494220644236, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 352145873.0, + "reward": 0.3671875, + "reward_std": 0.22225630283355713, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266862869263, + "sampling/importance_sampling_ratio/min": 0.0011975533561781049, + "sampling/sampling_logp_difference/max": 6.727474689483643, + "sampling/sampling_logp_difference/mean": 0.020445333793759346, + "step": 401 + }, + { + "clip_ratio/high_max": 2.3557336589874467e-05, + "clip_ratio/high_mean": 5.889334147468617e-06, + "clip_ratio/low_mean": 5.359988131203863e-05, + "clip_ratio/low_min": 1.3856095392839052e-05, + "clip_ratio/region_mean": 5.9489215118446737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 6942.65625, + "completions/mean_terminated_length": 6638.0966796875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.7541583999991417, + "epoch": 0.36982520699172033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003970830701291561, + "learning_rate": 1e-05, + "loss": 0.051, + "num_tokens": 353056405.0, + "reward": 0.453125, + "reward_std": 0.3282659649848938, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 8.399576472584158e-06, + "sampling/sampling_logp_difference/max": 11.687329292297363, + "sampling/sampling_logp_difference/mean": 0.018101349472999573, + "step": 402 + }, + { + 
"clip_ratio/high_max": 2.6139805413549766e-05, + "clip_ratio/high_mean": 7.517377525800839e-06, + "clip_ratio/low_mean": 1.968103515537223e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7198412681173068e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14786.0, + "completions/max_terminated_length": 14786.0, + "completions/mean_length": 6022.1875, + "completions/mean_terminated_length": 6022.1875, + "completions/min_length": 1285.0, + "completions/min_terminated_length": 1285.0, + "entropy": 0.9535745903849602, + "epoch": 0.37074517019319225, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0043656788766384125, + "learning_rate": 1e-05, + "loss": 0.029, + "num_tokens": 353844661.0, + "reward": 0.4140625, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.04981832951307297, + "sampling/sampling_logp_difference/max": 2.9993722438812256, + "sampling/sampling_logp_difference/mean": 0.020655371248722076, + "step": 403 + }, + { + "clip_ratio/high_max": 9.152076700047473e-06, + "clip_ratio/high_mean": 2.9508817647183605e-06, + "clip_ratio/low_mean": 5.21388310517068e-05, + "clip_ratio/low_min": 2.633131089169183e-06, + "clip_ratio/region_mean": 5.508971298695542e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15906.0, + "completions/mean_length": 8068.96875, + "completions/mean_terminated_length": 7869.408203125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.9473539590835571, + "epoch": 0.3716651333946642, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006543307099491358, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 354894689.0, + "reward": 0.2578125, + 
"reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 6.672408926533535e-05, + "sampling/sampling_logp_difference/max": 9.614944458007812, + "sampling/sampling_logp_difference/mean": 0.021852033212780952, + "step": 404 + }, + { + "clip_ratio/high_max": 2.9619268843816826e-05, + "clip_ratio/high_mean": 7.4048172109542065e-06, + "clip_ratio/low_mean": 5.5152235972855124e-05, + "clip_ratio/low_min": 1.0455875781190116e-05, + "clip_ratio/region_mean": 6.255705375224352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15748.0, + "completions/mean_length": 5960.1875, + "completions/mean_terminated_length": 5878.1103515625, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 0.9564141109585762, + "epoch": 0.37258509659613614, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003351036459207535, + "learning_rate": 1e-05, + "loss": 0.0293, + "num_tokens": 355677273.0, + "reward": 0.46875, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999220371246338, + "sampling/importance_sampling_ratio/min": 0.0012859756825491786, + "sampling/sampling_logp_difference/max": 6.656237602233887, + "sampling/sampling_logp_difference/mean": 0.021779976785182953, + "step": 405 + }, + { + "clip_ratio/high_max": 7.957685966175632e-06, + "clip_ratio/high_mean": 1.989421491543908e-06, + "clip_ratio/low_mean": 3.758041248147492e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.956983414354909e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15669.0, + "completions/mean_length": 7620.21875, + "completions/mean_terminated_length": 7189.212890625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 1.035948596894741, + "epoch": 0.3735050597976081, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031219006050378084, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 356675829.0, + "reward": 0.296875, + "reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001060962677002, + "sampling/importance_sampling_ratio/min": 0.010141897015273571, + "sampling/sampling_logp_difference/max": 4.591080188751221, + "sampling/sampling_logp_difference/mean": 0.021951109170913696, + "step": 406 + }, + { + "clip_ratio/high_max": 2.286768199155631e-05, + "clip_ratio/high_mean": 5.7169204978890775e-06, + "clip_ratio/low_mean": 3.914574369900947e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.486266482217616e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14038.0, + "completions/mean_length": 5806.0234375, + "completions/mean_terminated_length": 5638.119140625, + "completions/min_length": 1319.0, + "completions/min_terminated_length": 1319.0, + "entropy": 0.8977029845118523, + "epoch": 0.37442502299908004, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002810312667861581, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 357438712.0, + "reward": 0.546875, + "reward_std": 0.22832970321178436, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999280571937561, + 
"sampling/importance_sampling_ratio/min": 0.0011738575994968414, + "sampling/sampling_logp_difference/max": 6.747459888458252, + "sampling/sampling_logp_difference/mean": 0.01965375244617462, + "step": 407 + }, + { + "clip_ratio/high_max": 1.2219379641464911e-05, + "clip_ratio/high_mean": 3.054844910366228e-06, + "clip_ratio/low_mean": 3.186109779562685e-05, + "clip_ratio/low_min": 4.3511558942554984e-06, + "clip_ratio/region_mean": 3.4915943160740426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15705.0, + "completions/max_terminated_length": 15705.0, + "completions/mean_length": 6537.4609375, + "completions/mean_terminated_length": 6537.4609375, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.9577726796269417, + "epoch": 0.37534498620055196, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004516562446951866, + "learning_rate": 1e-05, + "loss": 0.0517, + "num_tokens": 358296731.0, + "reward": 0.3828125, + "reward_std": 0.1830746978521347, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999170303344727, + "sampling/importance_sampling_ratio/min": 2.384942035860149e-06, + "sampling/sampling_logp_difference/max": 12.946335792541504, + "sampling/sampling_logp_difference/mean": 0.021242395043373108, + "step": 408 + }, + { + "clip_ratio/high_max": 1.4422689218918094e-05, + "clip_ratio/high_mean": 3.6056723047295236e-06, + "clip_ratio/low_mean": 3.026239573955536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3868068385345396e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 7896.671875, + "completions/mean_terminated_length": 7622.88671875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + 
"entropy": 0.9163230583071709, + "epoch": 0.37626494940202393, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003542230697348714, + "learning_rate": 1e-05, + "loss": 0.05, + "num_tokens": 359327001.0, + "reward": 0.375, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998560547828674, + "sampling/importance_sampling_ratio/min": 0.00010891625424847007, + "sampling/sampling_logp_difference/max": 9.124931335449219, + "sampling/sampling_logp_difference/mean": 0.020085681229829788, + "step": 409 + }, + { + "clip_ratio/high_max": 1.7827243254942005e-05, + "clip_ratio/high_mean": 5.474494003010477e-06, + "clip_ratio/low_mean": 4.2465159026505717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.793965263161226e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15297.0, + "completions/mean_length": 6728.7109375, + "completions/mean_terminated_length": 6652.68505859375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9010183215141296, + "epoch": 0.37718491260349585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0035069347359240055, + "learning_rate": 1e-05, + "loss": 0.0518, + "num_tokens": 360208780.0, + "reward": 0.5390625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999571442604065, + "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05, + "sampling/sampling_logp_difference/max": 11.124998092651367, + "sampling/sampling_logp_difference/mean": 0.021022530272603035, + "step": 410 + }, + { + "clip_ratio/high_max": 1.0376989393989788e-05, + "clip_ratio/high_mean": 
2.594247348497447e-06, + "clip_ratio/low_mean": 2.8587513156708155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1181759936771414e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6800.3984375, + "completions/mean_terminated_length": 6491.25, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.8654960840940475, + "epoch": 0.3781048758049678, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033910400234162807, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 361098567.0, + "reward": 0.5625, + "reward_std": 0.2306838035583496, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998576641082764, + "sampling/importance_sampling_ratio/min": 0.001449413481168449, + "sampling/sampling_logp_difference/max": 6.536596298217773, + "sampling/sampling_logp_difference/mean": 0.019660964608192444, + "step": 411 + }, + { + "clip_ratio/high_max": 2.3068858354236e-05, + "clip_ratio/high_mean": 7.792090059410839e-06, + "clip_ratio/low_mean": 5.8515578757578623e-05, + "clip_ratio/low_min": 1.0348648629587842e-05, + "clip_ratio/region_mean": 6.630766870330262e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7103.4453125, + "completions/mean_terminated_length": 6956.13525390625, + "completions/min_length": 1711.0, + "completions/min_terminated_length": 1711.0, + "entropy": 0.8317076042294502, + "epoch": 0.37902483900643974, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036110079381614923, + "learning_rate": 1e-05, + "loss": 0.0834, + "num_tokens": 362027520.0, + "reward": 0.546875, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 
0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999338984489441, + "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05, + "sampling/sampling_logp_difference/max": 11.458046913146973, + "sampling/sampling_logp_difference/mean": 0.01939362846314907, + "step": 412 + }, + { + "clip_ratio/high_max": 3.112394779236638e-06, + "clip_ratio/high_mean": 7.780986948091595e-07, + "clip_ratio/low_mean": 5.127149995587388e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.204959859383962e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15830.0, + "completions/mean_length": 7344.9296875, + "completions/mean_terminated_length": 6900.384765625, + "completions/min_length": 1368.0, + "completions/min_terminated_length": 1368.0, + "entropy": 0.8387318029999733, + "epoch": 0.37994480220791166, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002141098491847515, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 362985207.0, + "reward": 0.34375, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322891235352, + "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05, + "sampling/sampling_logp_difference/max": 10.874617576599121, + "sampling/sampling_logp_difference/mean": 0.01929464004933834, + "step": 413 + }, + { + "clip_ratio/high_max": 5.2602786126954015e-06, + "clip_ratio/high_mean": 1.3150696531738504e-06, + "clip_ratio/low_mean": 1.7854434247510653e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9169503786997666e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16137.0, + 
"completions/mean_length": 6377.7734375, + "completions/mean_terminated_length": 6218.94482421875, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9732858911156654, + "epoch": 0.38086476540938363, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015244127716869116, + "learning_rate": 1e-05, + "loss": 0.0608, + "num_tokens": 363823914.0, + "reward": 0.4375, + "reward_std": 0.1988610327243805, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 0.006335465237498283, + "sampling/sampling_logp_difference/max": 5.061592102050781, + "sampling/sampling_logp_difference/mean": 0.020688029006123543, + "step": 414 + }, + { + "clip_ratio/high_max": 2.6195500595349586e-05, + "clip_ratio/high_mean": 6.548875148837396e-06, + "clip_ratio/low_mean": 3.3802934012783226e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035180882056011e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14456.0, + "completions/mean_length": 5599.7890625, + "completions/mean_terminated_length": 5340.96826171875, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8872368410229683, + "epoch": 0.38178472861085555, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002647512126713991, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 364561127.0, + "reward": 0.453125, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999077916145325, + "sampling/importance_sampling_ratio/min": 2.370526999584399e-06, + "sampling/sampling_logp_difference/max": 
12.952398300170898, + "sampling/sampling_logp_difference/mean": 0.01878243312239647, + "step": 415 + }, + { + "clip_ratio/high_max": 2.157278959202813e-05, + "clip_ratio/high_mean": 5.3931973980070325e-06, + "clip_ratio/low_mean": 7.215861739950924e-05, + "clip_ratio/low_min": 1.4898997051204788e-05, + "clip_ratio/region_mean": 7.755181559332414e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15905.0, + "completions/mean_length": 7877.2890625, + "completions/mean_terminated_length": 7385.1650390625, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.8416353687644005, + "epoch": 0.3827046918123275, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018051012884825468, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 365590124.0, + "reward": 0.3125, + "reward_std": 0.28407180309295654, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.0004095165350008756, + "sampling/sampling_logp_difference/max": 7.800533294677734, + "sampling/sampling_logp_difference/mean": 0.019809434190392494, + "step": 416 + }, + { + "clip_ratio/high_max": 2.540994637456606e-05, + "clip_ratio/high_mean": 6.352486593641515e-06, + "clip_ratio/low_mean": 4.230594890941575e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8658435844117776e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16083.0, + "completions/mean_length": 6836.7890625, + "completions/mean_terminated_length": 6200.30859375, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.8647575601935387, + "epoch": 0.38362465501379944, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.004550795070827007, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 366486337.0, + "reward": 0.40625, + "reward_std": 0.22620806097984314, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873638153076, + "sampling/importance_sampling_ratio/min": 0.0001089095021598041, + "sampling/sampling_logp_difference/max": 9.124993324279785, + "sampling/sampling_logp_difference/mean": 0.01992485672235489, + "step": 417 + }, + { + "clip_ratio/high_max": 1.1592664577619871e-05, + "clip_ratio/high_mean": 2.8981661444049678e-06, + "clip_ratio/low_mean": 3.5717548257707676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.861571451579948e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16286.0, + "completions/mean_length": 6884.953125, + "completions/mean_terminated_length": 6417.78662109375, + "completions/min_length": 1289.0, + "completions/min_terminated_length": 1289.0, + "entropy": 0.8691708743572235, + "epoch": 0.3845446182152714, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005958946421742439, + "learning_rate": 1e-05, + "loss": 0.1054, + "num_tokens": 367386163.0, + "reward": 0.5078125, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 9.519772902422119e-06, + "sampling/sampling_logp_difference/max": 11.562139511108398, + "sampling/sampling_logp_difference/mean": 0.019436441361904144, + "step": 418 + }, + { + "clip_ratio/high_max": 2.7658640192385064e-05, + "clip_ratio/high_mean": 8.455849524580117e-06, + "clip_ratio/low_mean": 3.938097847822064e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.7836828116487595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15574.0, + "completions/mean_length": 7439.1328125, + "completions/mean_terminated_length": 7150.58837890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.795464999973774, + "epoch": 0.38546458141674333, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00558120384812355, + "learning_rate": 1e-05, + "loss": 0.1918, + "num_tokens": 368357500.0, + "reward": 0.609375, + "reward_std": 0.3795146346092224, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.0001159337698481977, + "sampling/sampling_logp_difference/max": 9.062491416931152, + "sampling/sampling_logp_difference/mean": 0.018824251368641853, + "step": 419 + }, + { + "clip_ratio/high_max": 8.509555527780321e-06, + "clip_ratio/high_mean": 2.1273888819450804e-06, + "clip_ratio/low_mean": 3.0958593640662e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.308598269313734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16236.0, + "completions/mean_length": 6751.53125, + "completions/mean_terminated_length": 6520.3525390625, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 0.9450879693031311, + "epoch": 0.38638454461821525, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004628168884664774, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 369242920.0, + "reward": 0.359375, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.0006074689445085824, + "sampling/sampling_logp_difference/max": 7.406209468841553, + "sampling/sampling_logp_difference/mean": 0.019376013427972794, + "step": 420 + }, + { + "clip_ratio/high_max": 1.8288420505996328e-05, + "clip_ratio/high_mean": 4.572105126499082e-06, + "clip_ratio/low_mean": 4.86290555272717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320115997164976e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16164.0, + "completions/mean_length": 7023.296875, + "completions/mean_terminated_length": 6315.3447265625, + "completions/min_length": 1628.0, + "completions/min_terminated_length": 1628.0, + "entropy": 0.7378111630678177, + "epoch": 0.3873045078196872, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00389425759203732, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 370159510.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999127388000488, + "sampling/importance_sampling_ratio/min": 0.00014012664905749261, + "sampling/sampling_logp_difference/max": 8.872963905334473, + "sampling/sampling_logp_difference/mean": 0.016914553940296173, + "step": 421 + }, + { + "clip_ratio/high_max": 2.1269573153404053e-05, + "clip_ratio/high_mean": 5.948400371380558e-06, + "clip_ratio/low_mean": 2.3538930747690756e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9487331687505502e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16018.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 7702.3046875, + "completions/mean_terminated_length": 7702.3046875, + "completions/min_length": 423.0, + 
"completions/min_terminated_length": 423.0, + "entropy": 0.9053447172045708, + "epoch": 0.38822447102115915, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004324545152485371, + "learning_rate": 1e-05, + "loss": 0.0149, + "num_tokens": 371162773.0, + "reward": 0.2421875, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00001060962677, + "sampling/importance_sampling_ratio/min": 2.283278627146501e-05, + "sampling/sampling_logp_difference/max": 10.687313079833984, + "sampling/sampling_logp_difference/mean": 0.020495830103754997, + "step": 422 + }, + { + "clip_ratio/high_max": 1.0294916819475475e-05, + "clip_ratio/high_mean": 2.5737292048688687e-06, + "clip_ratio/low_mean": 5.831611520079605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.088984559937671e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 6904.78125, + "completions/mean_terminated_length": 6754.31787109375, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.7991176024079323, + "epoch": 0.3891444342226311, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003239463549107313, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 372067241.0, + "reward": 0.328125, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00012340991816017777, + "sampling/sampling_logp_difference/max": 8.999999046325684, + "sampling/sampling_logp_difference/mean": 0.019042208790779114, + "step": 423 + }, + { + "clip_ratio/high_max": 
2.7261318791715894e-05, + "clip_ratio/high_mean": 7.926559305815317e-06, + "clip_ratio/low_mean": 1.552133551285806e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3447895273420727e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15399.0, + "completions/mean_length": 6107.7421875, + "completions/mean_terminated_length": 5602.35205078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.9495253190398216, + "epoch": 0.39006439742410304, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015464330790564418, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 372866072.0, + "reward": 0.421875, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971330165863, + "sampling/importance_sampling_ratio/min": 0.00024684349773451686, + "sampling/sampling_logp_difference/max": 8.306756019592285, + "sampling/sampling_logp_difference/mean": 0.019793221727013588, + "step": 424 + }, + { + "clip_ratio/high_max": 2.457227401464479e-05, + "clip_ratio/high_mean": 8.533324717063806e-06, + "clip_ratio/low_mean": 3.261690835643094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.115023284612107e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15939.0, + "completions/mean_length": 6079.8046875, + "completions/mean_terminated_length": 5747.4111328125, + "completions/min_length": 1082.0, + "completions/min_terminated_length": 1082.0, + "entropy": 0.8005363270640373, + "epoch": 0.39098436062557496, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024811832699924707, + "learning_rate": 1e-05, + "loss": 0.1124, + "num_tokens": 373663463.0, + "reward": 0.625, + "reward_std": 
0.2630355656147003, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743103981018, + "sampling/importance_sampling_ratio/min": 0.00019348970090504736, + "sampling/sampling_logp_difference/max": 8.550286293029785, + "sampling/sampling_logp_difference/mean": 0.017151469364762306, + "step": 425 + }, + { + "clip_ratio/high_max": 3.3719989005476236e-06, + "clip_ratio/high_mean": 8.429997251369059e-07, + "clip_ratio/low_mean": 2.132218082806503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2165180553201935e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14925.0, + "completions/mean_length": 6453.7890625, + "completions/mean_terminated_length": 6375.5986328125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "entropy": 0.9212624430656433, + "epoch": 0.39190432382704693, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031475063879042864, + "learning_rate": 1e-05, + "loss": 0.0959, + "num_tokens": 374517492.0, + "reward": 0.34375, + "reward_std": 0.19910329580307007, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999594688415527, + "sampling/importance_sampling_ratio/min": 0.015664709731936455, + "sampling/sampling_logp_difference/max": 4.156344890594482, + "sampling/sampling_logp_difference/mean": 0.019899867475032806, + "step": 426 + }, + { + "clip_ratio/high_max": 1.907509408738406e-05, + "clip_ratio/high_mean": 5.984868664654641e-06, + "clip_ratio/low_mean": 3.784128080042137e-05, + "clip_ratio/low_min": 3.7751804029539926e-06, + "clip_ratio/region_mean": 4.382614952191943e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16159.0, + 
"completions/max_terminated_length": 16159.0, + "completions/mean_length": 6126.9921875, + "completions/mean_terminated_length": 6126.9921875, + "completions/min_length": 1106.0, + "completions/min_terminated_length": 1106.0, + "entropy": 0.8252849578857422, + "epoch": 0.39282428702851885, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004200868774205446, + "learning_rate": 1e-05, + "loss": 0.0276, + "num_tokens": 375320339.0, + "reward": 0.4140625, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999815225601196, + "sampling/importance_sampling_ratio/min": 0.005763276945799589, + "sampling/sampling_logp_difference/max": 5.156249046325684, + "sampling/sampling_logp_difference/mean": 0.01833093911409378, + "step": 427 + }, + { + "clip_ratio/high_max": 1.8918785372079583e-05, + "clip_ratio/high_mean": 5.476571459439583e-06, + "clip_ratio/low_mean": 6.169724406390742e-05, + "clip_ratio/low_min": 7.494657666029525e-06, + "clip_ratio/region_mean": 6.717381506859965e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15411.0, + "completions/mean_length": 6739.09375, + "completions/mean_terminated_length": 6427.9677734375, + "completions/min_length": 1228.0, + "completions/min_terminated_length": 1228.0, + "entropy": 0.8008574098348618, + "epoch": 0.3937442502299908, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003204014617949724, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 376201015.0, + "reward": 0.5390625, + "reward_std": 0.37086254358291626, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998303651809692, + "sampling/importance_sampling_ratio/min": 
0.00010144581028725952, + "sampling/sampling_logp_difference/max": 9.195985794067383, + "sampling/sampling_logp_difference/mean": 0.018961725756525993, + "step": 428 + }, + { + "clip_ratio/high_max": 1.3558789078160771e-05, + "clip_ratio/high_mean": 3.389697269540193e-06, + "clip_ratio/low_mean": 5.3925050679026754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.731474743697618e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15634.0, + "completions/mean_length": 7245.8984375, + "completions/mean_terminated_length": 6951.12060546875, + "completions/min_length": 1306.0, + "completions/min_terminated_length": 1306.0, + "entropy": 1.0351596996188164, + "epoch": 0.39466421343146274, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0039763906970620155, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 377149650.0, + "reward": 0.375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000600814819336, + "sampling/importance_sampling_ratio/min": 8.106228051474318e-05, + "sampling/sampling_logp_difference/max": 9.420292854309082, + "sampling/sampling_logp_difference/mean": 0.020948028191924095, + "step": 429 + }, + { + "clip_ratio/high_max": 1.4580486549675697e-05, + "clip_ratio/high_mean": 4.259903903403028e-06, + "clip_ratio/low_mean": 4.6149686397711775e-05, + "clip_ratio/low_min": 3.006686938533676e-06, + "clip_ratio/region_mean": 5.04095905853319e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 6958.625, + "completions/mean_terminated_length": 6495.08154296875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "entropy": 0.8360240310430527, + "epoch": 
0.39558417663293466, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0031417158897966146, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 378057802.0, + "reward": 0.515625, + "reward_std": 0.35771697759628296, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999384880065918, + "sampling/importance_sampling_ratio/min": 0.00010235882655251771, + "sampling/sampling_logp_difference/max": 9.187026023864746, + "sampling/sampling_logp_difference/mean": 0.019185224547982216, + "step": 430 + }, + { + "clip_ratio/high_max": 6.681633749394678e-06, + "clip_ratio/high_mean": 1.6704084373486694e-06, + "clip_ratio/low_mean": 5.096616632727091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.263657521936693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15410.0, + "completions/max_terminated_length": 15410.0, + "completions/mean_length": 5696.3984375, + "completions/mean_terminated_length": 5696.3984375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.7887749597430229, + "epoch": 0.39650413983440663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004943124484270811, + "learning_rate": 1e-05, + "loss": 0.096, + "num_tokens": 378808021.0, + "reward": 0.515625, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999057054519653, + "sampling/importance_sampling_ratio/min": 0.0015042300801724195, + "sampling/sampling_logp_difference/max": 6.499474048614502, + "sampling/sampling_logp_difference/mean": 0.018845941871404648, + "step": 431 + }, + { + "clip_ratio/high_max": 1.7526824194646906e-05, + "clip_ratio/high_mean": 5.417880970526312e-06, + "clip_ratio/low_mean": 
3.513921649300755e-05, + "clip_ratio/low_min": 6.075038982089609e-06, + "clip_ratio/region_mean": 4.0557096895099676e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14233.0, + "completions/mean_length": 6480.8828125, + "completions/mean_terminated_length": 6323.69091796875, + "completions/min_length": 1013.0, + "completions/min_terminated_length": 1013.0, + "entropy": 0.8796411231160164, + "epoch": 0.39742410303587855, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00595651101320982, + "learning_rate": 1e-05, + "loss": 0.0546, + "num_tokens": 379659710.0, + "reward": 0.3984375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 0.0017907419241964817, + "sampling/sampling_logp_difference/max": 6.325125217437744, + "sampling/sampling_logp_difference/mean": 0.01906527951359749, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4512424602107785e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4512424602107785e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 7501.703125, + "completions/mean_terminated_length": 6829.93310546875, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "entropy": 0.786028303205967, + "epoch": 0.3983440662373505, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0024527597706764936, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 380640720.0, + "reward": 0.5234375, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999595880508423, + "sampling/importance_sampling_ratio/min": 8.851602615322918e-07, + "sampling/sampling_logp_difference/max": 13.93749713897705, + "sampling/sampling_logp_difference/mean": 0.01873261108994484, + "step": 433 + }, + { + "clip_ratio/high_max": 1.4606259583160863e-05, + "clip_ratio/high_mean": 5.505394312876888e-06, + "clip_ratio/low_mean": 3.1679782978244475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7185177234277944e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15185.0, + "completions/mean_length": 5619.2890625, + "completions/mean_terminated_length": 5448.4208984375, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.8098893761634827, + "epoch": 0.39926402943882244, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004280989523977041, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 381377981.0, + "reward": 0.609375, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.0010248658945783973, + "sampling/sampling_logp_difference/max": 6.883193492889404, + "sampling/sampling_logp_difference/mean": 0.017923470586538315, + "step": 434 + }, + { + "clip_ratio/high_max": 1.4808703554081148e-05, + "clip_ratio/high_mean": 3.702175888520287e-06, + "clip_ratio/low_mean": 2.3637440563106793e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7339616224253405e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5243.8203125, + "completions/mean_terminated_length": 
5156.1025390625, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "entropy": 0.7485036551952362, + "epoch": 0.40018399264029436, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004721642471849918, + "learning_rate": 1e-05, + "loss": 0.0877, + "num_tokens": 382070478.0, + "reward": 0.6875, + "reward_std": 0.26538965106010437, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999414086341858, + "sampling/importance_sampling_ratio/min": 0.0011518355458974838, + "sampling/sampling_logp_difference/max": 6.7663984298706055, + "sampling/sampling_logp_difference/mean": 0.016579966992139816, + "step": 435 + }, + { + "clip_ratio/high_max": 3.1177480195765384e-05, + "clip_ratio/high_mean": 1.1174359769938746e-05, + "clip_ratio/low_mean": 3.602651599976525e-05, + "clip_ratio/low_min": 4.348733455117326e-06, + "clip_ratio/region_mean": 4.720087713394605e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15978.0, + "completions/mean_length": 7021.1796875, + "completions/mean_terminated_length": 6872.56396484375, + "completions/min_length": 1371.0, + "completions/min_terminated_length": 1371.0, + "entropy": 0.8693460151553154, + "epoch": 0.40110395584176634, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00329192029312253, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 382990245.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.0023386883549392223, + "sampling/sampling_logp_difference/max": 6.058165073394775, + "sampling/sampling_logp_difference/mean": 
0.019863136112689972, + "step": 436 + }, + { + "clip_ratio/high_max": 1.1192694955752813e-05, + "clip_ratio/high_mean": 2.7981737389382033e-06, + "clip_ratio/low_mean": 4.9078003257818636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.1876177280973934e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15344.0, + "completions/mean_length": 6917.625, + "completions/mean_terminated_length": 6452.0654296875, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "entropy": 0.8466897681355476, + "epoch": 0.40202391904323825, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0051889242604374886, + "learning_rate": 1e-05, + "loss": 0.1009, + "num_tokens": 383896717.0, + "reward": 0.4140625, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999983310699463, + "sampling/importance_sampling_ratio/min": 0.00015846389578655362, + "sampling/sampling_logp_difference/max": 8.749983787536621, + "sampling/sampling_logp_difference/mean": 0.019528398290276527, + "step": 437 + }, + { + "clip_ratio/high_max": 2.3224948108691024e-05, + "clip_ratio/high_mean": 8.263948757303297e-06, + "clip_ratio/low_mean": 3.8556312347282073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.682026019509067e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16175.0, + "completions/mean_length": 7487.5078125, + "completions/mean_terminated_length": 7346.2939453125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.9584660083055496, + "epoch": 0.4029438822447102, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002855573548004031, + "learning_rate": 1e-05, + "loss": 0.0087, + 
"num_tokens": 384872622.0, + "reward": 0.3828125, + "reward_std": 0.2477683424949646, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999386668205261, + "sampling/importance_sampling_ratio/min": 0.0038593418430536985, + "sampling/sampling_logp_difference/max": 5.557258605957031, + "sampling/sampling_logp_difference/mean": 0.0209865253418684, + "step": 438 + }, + { + "clip_ratio/high_max": 6.171620498207631e-06, + "clip_ratio/high_mean": 1.5429051245519076e-06, + "clip_ratio/low_mean": 2.98128834401723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.135578845103737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16092.0, + "completions/mean_length": 6637.5078125, + "completions/mean_terminated_length": 6323.1044921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 0.8841215297579765, + "epoch": 0.40386384544618215, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004437311552464962, + "learning_rate": 1e-05, + "loss": 0.0523, + "num_tokens": 385744023.0, + "reward": 0.3984375, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999136924743652, + "sampling/importance_sampling_ratio/min": 0.002925124252215028, + "sampling/sampling_logp_difference/max": 5.834418296813965, + "sampling/sampling_logp_difference/mean": 0.019490888342261314, + "step": 439 + }, + { + "clip_ratio/high_max": 1.3304874300956726e-05, + "clip_ratio/high_mean": 3.3262185752391815e-06, + "clip_ratio/low_mean": 5.443932013804442e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.776553894065728e-05, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15143.0, + "completions/mean_length": 5965.9765625, + "completions/mean_terminated_length": 5800.611328125, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "entropy": 0.8726934269070625, + "epoch": 0.4047838086476541, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002463799435645342, + "learning_rate": 1e-05, + "loss": -0.0075, + "num_tokens": 386525492.0, + "reward": 0.3984375, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.00020367901015561074, + "sampling/sampling_logp_difference/max": 8.4989652633667, + "sampling/sampling_logp_difference/mean": 0.01946769654750824, + "step": 440 + }, + { + "clip_ratio/high_max": 1.0084711902891286e-05, + "clip_ratio/high_mean": 3.6154040117253317e-06, + "clip_ratio/low_mean": 3.598771945689805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9603123695997056e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6693.109375, + "completions/mean_terminated_length": 6616.80322265625, + "completions/min_length": 1704.0, + "completions/min_terminated_length": 1704.0, + "entropy": 0.9430640190839767, + "epoch": 0.40570377184912604, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038990566972643137, + "learning_rate": 1e-05, + "loss": 0.0415, + "num_tokens": 387404842.0, + "reward": 0.421875, + "reward_std": 0.31587693095207214, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999700784683228, + 
"sampling/importance_sampling_ratio/min": 0.0011708902893587947, + "sampling/sampling_logp_difference/max": 6.749990940093994, + "sampling/sampling_logp_difference/mean": 0.020848294720053673, + "step": 441 + }, + { + "clip_ratio/high_max": 7.462686426151777e-06, + "clip_ratio/high_mean": 1.8656716065379442e-06, + "clip_ratio/low_mean": 5.234285907818048e-05, + "clip_ratio/low_min": 4.47803950009984e-06, + "clip_ratio/region_mean": 5.420853057103159e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + "completions/mean_length": 7045.6953125, + "completions/mean_terminated_length": 6505.46240234375, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "entropy": 0.8912066072225571, + "epoch": 0.40662373505059796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018510994268581271, + "learning_rate": 1e-05, + "loss": 0.099, + "num_tokens": 388324475.0, + "reward": 0.40625, + "reward_std": 0.32195523381233215, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999024868011475, + "sampling/importance_sampling_ratio/min": 0.0031757301185280085, + "sampling/sampling_logp_difference/max": 5.752217769622803, + "sampling/sampling_logp_difference/mean": 0.020547039806842804, + "step": 442 + }, + { + "clip_ratio/high_max": 2.504527083146968e-05, + "clip_ratio/high_mean": 6.26131770786742e-06, + "clip_ratio/low_mean": 6.165269871871715e-05, + "clip_ratio/low_min": 3.5272871627967106e-06, + "clip_ratio/region_mean": 6.791401551708987e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15734.0, + "completions/mean_length": 7480.0078125, + "completions/mean_terminated_length": 7266.3125, + "completions/min_length": 1130.0, + "completions/min_terminated_length": 
1130.0, + "entropy": 0.8813760280609131, + "epoch": 0.40754369825206993, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004439481534063816, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 389305644.0, + "reward": 0.34375, + "reward_std": 0.31300368905067444, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999762773513794, + "sampling/importance_sampling_ratio/min": 0.007449973840266466, + "sampling/sampling_logp_difference/max": 4.899544715881348, + "sampling/sampling_logp_difference/mean": 0.01973455585539341, + "step": 443 + }, + { + "clip_ratio/high_max": 4.0980917219712865e-06, + "clip_ratio/high_mean": 1.0245229304928216e-06, + "clip_ratio/low_mean": 3.662567087303614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.76501939172158e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15302.0, + "completions/max_terminated_length": 15302.0, + "completions/mean_length": 7044.4453125, + "completions/mean_terminated_length": 7044.4453125, + "completions/min_length": 1229.0, + "completions/min_terminated_length": 1229.0, + "entropy": 0.9901906549930573, + "epoch": 0.40846366145354185, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004181519150733948, + "learning_rate": 1e-05, + "loss": -0.0068, + "num_tokens": 390229373.0, + "reward": 0.421875, + "reward_std": 0.17700131237506866, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000314712524414, + "sampling/importance_sampling_ratio/min": 0.00022536676260642707, + "sampling/sampling_logp_difference/max": 8.397781372070312, + "sampling/sampling_logp_difference/mean": 0.021211043000221252, + "step": 444 + }, + { + "clip_ratio/high_max": 1.4909872106727562e-05, + "clip_ratio/high_mean": 
3.7274680266818905e-06, + "clip_ratio/low_mean": 5.29995777469594e-05, + "clip_ratio/low_min": 3.708758640641463e-06, + "clip_ratio/region_mean": 5.672704537573736e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7815.8125, + "completions/mean_terminated_length": 7244.6005859375, + "completions/min_length": 1350.0, + "completions/min_terminated_length": 1350.0, + "entropy": 0.8278292864561081, + "epoch": 0.4093836246550138, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002691390924155712, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 391251141.0, + "reward": 0.3515625, + "reward_std": 0.31222954392433167, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 0.007715471088886261, + "sampling/sampling_logp_difference/max": 4.864527702331543, + "sampling/sampling_logp_difference/mean": 0.018415704369544983, + "step": 445 + }, + { + "clip_ratio/high_max": 2.1858722902834415e-05, + "clip_ratio/high_mean": 6.629899417021079e-06, + "clip_ratio/low_mean": 3.196247394043894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.859237290271267e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15202.0, + "completions/mean_length": 5305.1796875, + "completions/mean_terminated_length": 5217.94482421875, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.8100772425532341, + "epoch": 0.41030358785648574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0069543467834591866, + "learning_rate": 1e-05, + "loss": 0.1153, + "num_tokens": 391956196.0, + "reward": 0.609375, + "reward_std": 0.304571270942688, + 
"rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000190734863281, + "sampling/importance_sampling_ratio/min": 0.0024869756307452917, + "sampling/sampling_logp_difference/max": 5.996687889099121, + "sampling/sampling_logp_difference/mean": 0.017318082973361015, + "step": 446 + }, + { + "clip_ratio/high_max": 2.461934036546154e-05, + "clip_ratio/high_mean": 8.056288947955181e-06, + "clip_ratio/low_mean": 5.289376917971822e-05, + "clip_ratio/low_min": 4.21926688431995e-06, + "clip_ratio/region_mean": 6.0950058468733914e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15300.0, + "completions/mean_length": 7299.578125, + "completions/mean_terminated_length": 6930.29248046875, + "completions/min_length": 1008.0, + "completions/min_terminated_length": 1008.0, + "entropy": 0.9955824315547943, + "epoch": 0.41122355105795766, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0065611582249403, + "learning_rate": 1e-05, + "loss": 0.0883, + "num_tokens": 392908430.0, + "reward": 0.4375, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999696016311646, + "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06, + "sampling/sampling_logp_difference/max": 11.873339653015137, + "sampling/sampling_logp_difference/mean": 0.02127375639975071, + "step": 447 + }, + { + "clip_ratio/high_max": 2.4339562514796853e-05, + "clip_ratio/high_mean": 7.412756531266496e-06, + "clip_ratio/low_mean": 3.89272447591793e-05, + "clip_ratio/low_min": 4.047796210215893e-06, + "clip_ratio/region_mean": 4.6340001517819474e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16221.0, + "completions/mean_length": 6702.9375, + "completions/mean_terminated_length": 6390.64501953125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.82919991761446, + "epoch": 0.41214351425942963, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032975098583847284, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 393788286.0, + "reward": 0.4609375, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.00028582560480572283, + "sampling/sampling_logp_difference/max": 8.160128593444824, + "sampling/sampling_logp_difference/mean": 0.019461583346128464, + "step": 448 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 393788286, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-448/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-448/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-448/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. 
It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if 
not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + 
param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint.
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/README.md b/dapo_milora_plus_20251201_131939/checkpoint-512/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-512/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-512/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-512/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first 
%}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/latest b/dapo_milora_plus_20251201_131939/checkpoint-512/latest new file mode 100644 index 0000000000000000000000000000000000000000..35f851ced1a2a2007c68236a52dfc57e513ef909 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-512/latest @@ -0,0 +1 @@ +global_step512 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-512/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-512/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-512/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-512/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": 
"<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-512/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a09d2337ba5ef356f2482abac5ccca6256e7b984 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-512/trainer_state.json @@ -0,0 +1,15906 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.47102115915363385, + "eval_steps": 500, + "global_step": 512, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + 
"grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + "clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + "completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + 
"completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + "sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 
2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 
0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + 
"completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 
7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + 
"learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + "clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, 
+ "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + 
"sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 
0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 
3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 
5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + 
"step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, + "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 
20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, + "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 
0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 
2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + 
"completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + "sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 
0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + 
"reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + "clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + 
"sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 
1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 
2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + 
"rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + 
"sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + 
"clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 
0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + "completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + 
"completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, + "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + 
"sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + "learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, 
+ "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 
0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 
3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, 
+ "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 
7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + "sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + 
"sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + 
"loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + 
"completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 
4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 
0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, 
+ "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 
5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 
+ }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 
0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + 
"completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + "sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + 
"sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + "completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + 
"completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + "sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + 
"sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + 
"reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 
0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + 
"sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 
0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 
5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 
0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + "completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + 
"completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + "sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + 
"sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + "learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + 
"learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 
5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + 
"completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + 
"clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + 
"reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + "sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + 
"sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 
3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 
0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + "completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + 
"completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + "sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + 
"sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + 
"learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + 
"completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + "clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + 
{ + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 
0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + 
"sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + 
"completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + "clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + 
"clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 
0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + "completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 
2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 
3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 
0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + 
"completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, + "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + 
"sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + 
"learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + 
"clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 
6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 
0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + "reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 
0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + 
"completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + "sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + 
"completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 
2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + 
"reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + "completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + "sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + 
"sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + 
"entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 
6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, 
+ "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + "completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + "sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + 
"sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 
3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 
0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + 
"completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + "clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + 
"sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + 
"num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + "sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + 
"completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, 
+ "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + 
"completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + "clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 
7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + "reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + 
"grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 
6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 
227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + 
"reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + 
"sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 
0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 
1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + 
"rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 
5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 
6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + "sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + 
"completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 
1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 
0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + 
"completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + "step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 
8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 
0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + }, + { + "clip_ratio/high_max": 6.87833608026267e-06, + "clip_ratio/high_mean": 2.9462287329806713e-06, + 
"clip_ratio/low_mean": 5.435333650893881e-05, + "clip_ratio/low_min": 5.33937054569833e-06, + "clip_ratio/region_mean": 5.729956546929316e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 6448.0078125, + "completions/mean_terminated_length": 6369.771484375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9546648040413857, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004310046322643757, + "learning_rate": 1e-05, + "loss": 0.1082, + "num_tokens": 220304605.0, + "reward": 0.5703125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 0.0001234127557836473, + "sampling/sampling_logp_difference/max": 8.99997615814209, + "sampling/sampling_logp_difference/mean": 0.020253397524356842, + "step": 257 + }, + { + "clip_ratio/high_max": 6.196094091137638e-06, + "clip_ratio/high_mean": 1.5490235227844096e-06, + "clip_ratio/low_mean": 2.5416685957679874e-05, + "clip_ratio/low_min": 5.5736391004757024e-06, + "clip_ratio/region_mean": 2.696570959415112e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 7457.6484375, + "completions/mean_terminated_length": 6941.24755859375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "entropy": 0.8182889074087143, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026646999176591635, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 221281968.0, + "reward": 0.4453125, + "reward_std": 0.2012200653553009, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173283576965, + "sampling/importance_sampling_ratio/min": 2.902353571698768e-06, + "sampling/sampling_logp_difference/max": 12.749988555908203, + "sampling/sampling_logp_difference/mean": 0.019208962097764015, + "step": 258 + }, + { + "clip_ratio/high_max": 1.6189535017474554e-05, + "clip_ratio/high_mean": 4.047383754368639e-06, + "clip_ratio/low_mean": 3.127787306311802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.532525670379982e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 8561.109375, + "completions/mean_terminated_length": 7969.79052734375, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.9581378549337387, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016026750672608614, + "learning_rate": 1e-05, + "loss": 0.0131, + "num_tokens": 222399046.0, + "reward": 0.34375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 1.653693971093162e-06, + "sampling/sampling_logp_difference/max": 13.312499046325684, + "sampling/sampling_logp_difference/mean": 0.02173236384987831, + "step": 259 + }, + { + "clip_ratio/high_max": 1.4200771602190798e-05, + "clip_ratio/high_mean": 4.3255887476334465e-06, + "clip_ratio/low_mean": 5.2955770115659107e-05, + "clip_ratio/low_min": 3.402656830076012e-06, + "clip_ratio/region_mean": 5.7281358749605715e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16239.0, + "completions/mean_length": 7152.34375, + "completions/mean_terminated_length": 7079.6533203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9052041247487068, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005460259038954973, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 223335010.0, + "reward": 0.4296875, + "reward_std": 0.3356297016143799, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966621398926, + "sampling/importance_sampling_ratio/min": 0.010161337442696095, + "sampling/sampling_logp_difference/max": 4.589165210723877, + "sampling/sampling_logp_difference/mean": 0.01986619457602501, + "step": 260 + }, + { + "clip_ratio/high_max": 1.4350314813782461e-05, + "clip_ratio/high_mean": 3.5875787034456152e-06, + "clip_ratio/low_mean": 3.81288905373367e-05, + "clip_ratio/low_min": 8.099272235995159e-06, + "clip_ratio/region_mean": 4.1716469809216505e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 6678.65625, + "completions/mean_terminated_length": 6524.603515625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.9043187350034714, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005933742038905621, + "learning_rate": 1e-05, + "loss": 0.0966, + "num_tokens": 224207006.0, + "reward": 0.484375, + "reward_std": 0.3316681981086731, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000031590461731, + "sampling/importance_sampling_ratio/min": 
0.0011734943836927414, + "sampling/sampling_logp_difference/max": 6.747769355773926, + "sampling/sampling_logp_difference/mean": 0.019827336072921753, + "step": 261 + }, + { + "clip_ratio/high_max": 1.6498819377375185e-05, + "clip_ratio/high_mean": 4.124704844343796e-06, + "clip_ratio/low_mean": 3.601791678420341e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014262168539062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6999.0390625, + "completions/mean_terminated_length": 6850.07177734375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8109970837831497, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003635740838944912, + "learning_rate": 1e-05, + "loss": 0.104, + "num_tokens": 225122891.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303817749023, + "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05, + "sampling/sampling_logp_difference/max": 10.987512588500977, + "sampling/sampling_logp_difference/mean": 0.018912551924586296, + "step": 262 + }, + { + "clip_ratio/high_max": 9.527577958579059e-06, + "clip_ratio/high_mean": 2.3818944896447647e-06, + "clip_ratio/low_mean": 3.766565987461945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004755419373396e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7483.7109375, + "completions/mean_terminated_length": 7045.9912109375, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "entropy": 0.9473970532417297, + "epoch": 
0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003405241761356592, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 226102462.0, + "reward": 0.4453125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002920627594, + "sampling/importance_sampling_ratio/min": 0.00525119062513113, + "sampling/sampling_logp_difference/max": 5.249300479888916, + "sampling/sampling_logp_difference/mean": 0.021076779812574387, + "step": 263 + }, + { + "clip_ratio/high_max": 1.5867321963014547e-05, + "clip_ratio/high_mean": 3.966830490753637e-06, + "clip_ratio/low_mean": 3.8259706570897833e-05, + "clip_ratio/low_min": 3.549019083948224e-06, + "clip_ratio/region_mean": 4.2226537743772496e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 7569.03125, + "completions/mean_terminated_length": 7357.47216796875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9231455475091934, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025927501264959574, + "learning_rate": 1e-05, + "loss": 0.0801, + "num_tokens": 227093562.0, + "reward": 0.3984375, + "reward_std": 0.19097033143043518, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0052477638237178326, + "sampling/sampling_logp_difference/max": 5.249953269958496, + "sampling/sampling_logp_difference/mean": 0.020578444004058838, + "step": 264 + }, + { + "clip_ratio/high_max": 1.344091060673236e-05, + "clip_ratio/high_mean": 3.36022765168309e-06, 
+ "clip_ratio/low_mean": 4.253613235505327e-05, + "clip_ratio/low_min": 3.5579084851633525e-06, + "clip_ratio/region_mean": 4.5896360120423196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 7589.2734375, + "completions/mean_terminated_length": 7378.2001953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9265239909291267, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030512227676808834, + "learning_rate": 1e-05, + "loss": 0.04, + "num_tokens": 228086405.0, + "reward": 0.4296875, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0002165911573683843, + "sampling/sampling_logp_difference/max": 8.437499046325684, + "sampling/sampling_logp_difference/mean": 0.020208362489938736, + "step": 265 + }, + { + "clip_ratio/high_max": 1.9613525410022703e-05, + "clip_ratio/high_mean": 4.903381352505676e-06, + "clip_ratio/low_mean": 3.184792547017423e-05, + "clip_ratio/low_min": 7.29296516510658e-06, + "clip_ratio/region_mean": 3.675130722058384e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 8420.6875, + "completions/mean_terminated_length": 8096.97509765625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "entropy": 0.9572964608669281, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022430522367358208, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 229183765.0, + "reward": 0.34375, + "reward_std": 0.309583842754364, + 
"rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 0.00029693738906644285, + "sampling/sampling_logp_difference/max": 8.121989250183105, + "sampling/sampling_logp_difference/mean": 0.021570362150669098, + "step": 266 + }, + { + "clip_ratio/high_max": 6.728750577167375e-06, + "clip_ratio/high_mean": 1.6821876442918438e-06, + "clip_ratio/low_mean": 2.1682553096979973e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.336474062758498e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15736.0, + "completions/mean_length": 6809.765625, + "completions/mean_terminated_length": 6579.984375, + "completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.884086549282074, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004295065999031067, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 230077607.0, + "reward": 0.484375, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00754612497985363, + "sampling/sampling_logp_difference/max": 4.886721134185791, + "sampling/sampling_logp_difference/mean": 0.019895706325769424, + "step": 267 + }, + { + "clip_ratio/high_max": 2.8609347509700456e-05, + "clip_ratio/high_mean": 7.152336877425114e-06, + "clip_ratio/low_mean": 5.158006410965754e-05, + "clip_ratio/low_min": 5.210069957684027e-06, + "clip_ratio/region_mean": 5.873240070286556e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15080.0, + "completions/mean_length": 7340.6953125, + "completions/mean_terminated_length": 6973.0810546875, + "completions/min_length": 1616.0, + "completions/min_terminated_length": 1616.0, + "entropy": 0.9920620769262314, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004631794057786465, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 231035616.0, + "reward": 0.4375, + "reward_std": 0.3235401213169098, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.0002508950710762292, + "sampling/sampling_logp_difference/max": 8.290475845336914, + "sampling/sampling_logp_difference/mean": 0.020591016858816147, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.3085940774290066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3085940774290066e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14120.0, + "completions/mean_length": 6748.875, + "completions/mean_terminated_length": 6595.93701171875, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.9867061004042625, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035752104595303535, + "learning_rate": 1e-05, + "loss": 0.0455, + "num_tokens": 231920056.0, + "reward": 0.40625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.0003869794018100947, + 
"sampling/sampling_logp_difference/max": 7.8571391105651855, + "sampling/sampling_logp_difference/mean": 0.02061416581273079, + "step": 269 + }, + { + "clip_ratio/high_max": 1.2506750408647349e-05, + "clip_ratio/high_mean": 3.1266876021618373e-06, + "clip_ratio/low_mean": 3.10397430212106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.416643085074611e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 7260.3046875, + "completions/mean_terminated_length": 7188.46435546875, + "completions/min_length": 1384.0, + "completions/min_terminated_length": 1384.0, + "entropy": 1.0388494208455086, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036644963547587395, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 232869159.0, + "reward": 0.390625, + "reward_std": 0.2359209954738617, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999546408653259, + "sampling/importance_sampling_ratio/min": 0.0008660226594656706, + "sampling/sampling_logp_difference/max": 7.051599502563477, + "sampling/sampling_logp_difference/mean": 0.02120530977845192, + "step": 270 + }, + { + "clip_ratio/high_max": 2.704355301830219e-05, + "clip_ratio/high_mean": 6.760888254575548e-06, + "clip_ratio/low_mean": 3.1861192269388994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862208097871189e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16073.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 6354.4609375, + "completions/mean_terminated_length": 6354.4609375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "entropy": 0.8405331820249557, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 
0.3125, + "grad_norm": 0.004709267523139715, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 233702842.0, + "reward": 0.546875, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 0.0046309432946145535, + "sampling/sampling_logp_difference/max": 5.37499475479126, + "sampling/sampling_logp_difference/mean": 0.019126038998365402, + "step": 271 + }, + { + "clip_ratio/high_max": 9.749228638611385e-06, + "clip_ratio/high_mean": 2.437307159652846e-06, + "clip_ratio/low_mean": 3.855073941849696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.098804652130639e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16026.0, + "completions/mean_length": 6514.578125, + "completions/mean_terminated_length": 6357.9208984375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 1.0254098922014236, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003066045930609107, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 234556348.0, + "reward": 0.4375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 0.005210204049944878, + "sampling/sampling_logp_difference/max": 5.257136344909668, + "sampling/sampling_logp_difference/mean": 0.019960148259997368, + "step": 272 + }, + { + "clip_ratio/high_max": 1.0475813724042382e-05, + "clip_ratio/high_mean": 2.6189534310105955e-06, + "clip_ratio/low_mean": 3.487835761006863e-05, + "clip_ratio/low_min": 
2.9392399483185727e-06, + "clip_ratio/region_mean": 3.749731081370555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 7379.5546875, + "completions/mean_terminated_length": 7236.62744140625, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 1.0397320613265038, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005132520105689764, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 235521091.0, + "reward": 0.2890625, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999256134033203, + "sampling/importance_sampling_ratio/min": 0.00016659013635944575, + "sampling/sampling_logp_difference/max": 8.699974060058594, + "sampling/sampling_logp_difference/mean": 0.021417103707790375, + "step": 273 + }, + { + "clip_ratio/high_max": 1.9904123973901733e-05, + "clip_ratio/high_mean": 5.776861314643611e-06, + "clip_ratio/low_mean": 2.6659268655748747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2436129686175263e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 7837.1640625, + "completions/mean_terminated_length": 7632.04052734375, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "entropy": 0.8400963917374611, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028969801496714354, + "learning_rate": 1e-05, + "loss": 0.0143, + "num_tokens": 236544160.0, + "reward": 0.3828125, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887943267822, + "sampling/importance_sampling_ratio/min": 2.883308241052873e-07, + "sampling/sampling_logp_difference/max": 15.059157371520996, + "sampling/sampling_logp_difference/mean": 0.019267702475190163, + "step": 274 + }, + { + "clip_ratio/high_max": 8.562770290154731e-06, + "clip_ratio/high_mean": 2.1406925725386827e-06, + "clip_ratio/low_mean": 4.060094340729847e-05, + "clip_ratio/low_min": 3.8700886761944275e-06, + "clip_ratio/region_mean": 4.2741635979837156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15350.0, + "completions/mean_length": 6696.3515625, + "completions/mean_terminated_length": 6542.57958984375, + "completions/min_length": 1239.0, + "completions/min_terminated_length": 1239.0, + "entropy": 0.8495818004012108, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003412836929783225, + "learning_rate": 1e-05, + "loss": 0.0803, + "num_tokens": 237423101.0, + "reward": 0.515625, + "reward_std": 0.37981897592544556, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.012152798473834991, + "sampling/sampling_logp_difference/max": 4.410195827484131, + "sampling/sampling_logp_difference/mean": 0.018458625301718712, + "step": 275 + }, + { + "clip_ratio/high_max": 1.1463653436294408e-05, + "clip_ratio/high_mean": 3.646129641765583e-06, + "clip_ratio/low_mean": 6.144847083078275e-05, + "clip_ratio/low_min": 1.110105540647055e-05, + "clip_ratio/region_mean": 6.509460160941671e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15666.0, + "completions/mean_length": 7700.3671875, + 
"completions/mean_terminated_length": 7121.45849609375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.8258870914578438, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024443145375698805, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 238429956.0, + "reward": 0.375, + "reward_std": 0.2872493863105774, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999113082885742, + "sampling/importance_sampling_ratio/min": 0.00026112530031241477, + "sampling/sampling_logp_difference/max": 8.250510215759277, + "sampling/sampling_logp_difference/mean": 0.019427984952926636, + "step": 276 + }, + { + "clip_ratio/high_max": 4.218127742205979e-06, + "clip_ratio/high_mean": 1.0545319355514948e-06, + "clip_ratio/low_mean": 1.7289162997258245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.834369493280974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16112.0, + "completions/mean_length": 6255.21875, + "completions/mean_terminated_length": 6094.44482421875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.8179014846682549, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022747826296836138, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 239250160.0, + "reward": 0.5234375, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.0002633975527714938, + "sampling/sampling_logp_difference/max": 8.241846084594727, + 
"sampling/sampling_logp_difference/mean": 0.018723051995038986, + "step": 277 + }, + { + "clip_ratio/high_max": 1.698448841125355e-05, + "clip_ratio/high_mean": 5.369374321162468e-06, + "clip_ratio/low_mean": 6.14647315160255e-05, + "clip_ratio/low_min": 5.043576493335422e-06, + "clip_ratio/region_mean": 6.683410583718796e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15321.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6914.9609375, + "completions/mean_terminated_length": 6914.9609375, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9700981751084328, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005685295443981886, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 240156211.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05, + "sampling/sampling_logp_difference/max": 9.997581481933594, + "sampling/sampling_logp_difference/mean": 0.021195171400904655, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9186837764427764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9186837764427764e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15469.0, + "completions/mean_length": 5227.53125, + "completions/mean_terminated_length": 5139.68505859375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9116031974554062, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003880272386595607, + "learning_rate": 1e-05, + "loss": 
0.1246, + "num_tokens": 240845295.0, + "reward": 0.6328125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000362396240234, + "sampling/importance_sampling_ratio/min": 0.00012422871077433228, + "sampling/sampling_logp_difference/max": 8.993386268615723, + "sampling/sampling_logp_difference/mean": 0.018801718950271606, + "step": 279 + }, + { + "clip_ratio/high_max": 2.5015486926349695e-05, + "clip_ratio/high_mean": 8.084949570275057e-06, + "clip_ratio/low_mean": 5.524710468307603e-05, + "clip_ratio/low_min": 3.776891389861703e-06, + "clip_ratio/region_mean": 6.333205465125502e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 8065.4765625, + "completions/mean_terminated_length": 7510.90869140625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.7446574792265892, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028986844699829817, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 241895676.0, + "reward": 0.4921875, + "reward_std": 0.3474721610546112, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.0017039099475368857, + "sampling/sampling_logp_difference/max": 6.3748297691345215, + "sampling/sampling_logp_difference/mean": 0.01853121444582939, + "step": 280 + }, + { + "clip_ratio/high_max": 9.486341014053323e-06, + "clip_ratio/high_mean": 2.371585253513331e-06, + "clip_ratio/low_mean": 2.896106741445692e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.133265261112683e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15534.0, + "completions/max_terminated_length": 15534.0, + "completions/mean_length": 6127.359375, + "completions/mean_terminated_length": 6127.359375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.8569132760167122, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003845847910270095, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 242698258.0, + "reward": 0.53125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000942945480347, + "sampling/importance_sampling_ratio/min": 0.00043231461313553154, + "sampling/sampling_logp_difference/max": 7.746356964111328, + "sampling/sampling_logp_difference/mean": 0.01856958493590355, + "step": 281 + }, + { + "clip_ratio/high_max": 2.9848330086679198e-05, + "clip_ratio/high_mean": 7.4620825216697995e-06, + "clip_ratio/low_mean": 4.3558867673709756e-05, + "clip_ratio/low_min": 4.417741820361698e-06, + "clip_ratio/region_mean": 5.1020949285884853e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15192.0, + "completions/mean_length": 6600.1484375, + "completions/mean_terminated_length": 6365.33642578125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.78924310952425, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003953634761273861, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 243560957.0, + "reward": 0.5546875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.0006525487406179309, + "sampling/sampling_logp_difference/max": 7.334624767303467, + "sampling/sampling_logp_difference/mean": 0.018097909167408943, + "step": 282 + }, + { + "clip_ratio/high_max": 6.635561703660642e-06, + "clip_ratio/high_mean": 1.6588904259151604e-06, + "clip_ratio/low_mean": 2.737523408313791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9034124281679397e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7852.171875, + "completions/mean_terminated_length": 7852.171875, + "completions/min_length": 1276.0, + "completions/min_terminated_length": 1276.0, + "entropy": 1.0598893761634827, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00360781978815794, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 244585923.0, + "reward": 0.3125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05, + "sampling/sampling_logp_difference/max": 10.076086044311523, + "sampling/sampling_logp_difference/mean": 0.022330068051815033, + "step": 283 + }, + { + "clip_ratio/high_max": 3.1540168947685743e-06, + "clip_ratio/high_mean": 7.885042236921436e-07, + "clip_ratio/low_mean": 4.7973388973332476e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.876189268543385e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7972.2265625, + "completions/mean_terminated_length": 7700.87890625, + "completions/min_length": 610.0, + 
"completions/min_terminated_length": 610.0, + "entropy": 0.933217465877533, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0027661293279379606, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 245628064.0, + "reward": 0.28125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05, + "sampling/sampling_logp_difference/max": 10.366576194763184, + "sampling/sampling_logp_difference/mean": 0.021125148981809616, + "step": 284 + }, + { + "clip_ratio/high_max": 1.2965969062861404e-05, + "clip_ratio/high_mean": 3.241492265715351e-06, + "clip_ratio/low_mean": 4.6317693090713874e-05, + "clip_ratio/low_min": 3.820877282123547e-06, + "clip_ratio/region_mean": 4.955918507221213e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7135.6953125, + "completions/mean_terminated_length": 6913.736328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.7786942347884178, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005680318456143141, + "learning_rate": 1e-05, + "loss": 0.0786, + "num_tokens": 246561329.0, + "reward": 0.4296875, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462366104126, + "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05, + "sampling/sampling_logp_difference/max": 9.737424850463867, + "sampling/sampling_logp_difference/mean": 0.018504241481423378, + "step": 285 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.22437145175536e-05, + "clip_ratio/low_min": 1.4025082009538892e-05, + "clip_ratio/region_mean": 4.22437145175536e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6704.046875, + "completions/mean_terminated_length": 6627.82666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 1.0435140281915665, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026402862276881933, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 247437415.0, + "reward": 0.3828125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.0007800163584761322, + "sampling/sampling_logp_difference/max": 7.156195640563965, + "sampling/sampling_logp_difference/mean": 0.02134273201227188, + "step": 286 + }, + { + "clip_ratio/high_max": 2.223430897174694e-05, + "clip_ratio/high_mean": 6.8746438159905665e-06, + "clip_ratio/low_mean": 4.7084630978133646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3959275192028144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 5892.5078125, + "completions/mean_terminated_length": 5725.9765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.8004944771528244, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003993614576756954, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 248211112.0, + "reward": 0.453125, + "reward_std": 
0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0024652592837810516, + "sampling/sampling_logp_difference/max": 6.005458354949951, + "sampling/sampling_logp_difference/mean": 0.01924925297498703, + "step": 287 + }, + { + "clip_ratio/high_max": 2.1833082200828358e-05, + "clip_ratio/high_mean": 5.458270550207089e-06, + "clip_ratio/low_mean": 3.415995615796419e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961822596920683e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 7812.140625, + "completions/mean_terminated_length": 7316.24755859375, + "completions/min_length": 1515.0, + "completions/min_terminated_length": 1515.0, + "entropy": 0.8841542899608612, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001573400106281042, + "learning_rate": 1e-05, + "loss": 0.0823, + "num_tokens": 249228106.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 0.001001527882181108, + "sampling/sampling_logp_difference/max": 6.906228542327881, + "sampling/sampling_logp_difference/mean": 0.01956877112388611, + "step": 288 + }, + { + "clip_ratio/high_max": 1.014439021673752e-05, + "clip_ratio/high_mean": 2.53609755418438e-06, + "clip_ratio/low_mean": 3.068193461785995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.321803217204433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16250.0, + "completions/mean_length": 6372.953125, + "completions/mean_terminated_length": 6132.6884765625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.8228401988744736, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021125099156051874, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 250063284.0, + "reward": 0.5, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05, + "sampling/sampling_logp_difference/max": 9.937475204467773, + "sampling/sampling_logp_difference/mean": 0.01943521574139595, + "step": 289 + }, + { + "clip_ratio/high_max": 7.023906164249638e-06, + "clip_ratio/high_mean": 1.7559765410624095e-06, + "clip_ratio/low_mean": 2.526416994896863e-05, + "clip_ratio/low_min": 6.7760895490209805e-06, + "clip_ratio/region_mean": 2.7020146660561295e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16270.0, + "completions/mean_length": 7817.8671875, + "completions/mean_terminated_length": 7396.58154296875, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.9454319775104523, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022315154783427715, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 251085123.0, + "reward": 0.40625, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 
2.8720330647047376e-06, + "sampling/sampling_logp_difference/max": 12.760490417480469, + "sampling/sampling_logp_difference/mean": 0.021764669567346573, + "step": 290 + }, + { + "clip_ratio/high_max": 1.4797966287005693e-05, + "clip_ratio/high_mean": 3.699491571751423e-06, + "clip_ratio/low_mean": 4.36271948274225e-05, + "clip_ratio/low_min": 3.6957101201551268e-06, + "clip_ratio/region_mean": 4.732668639917392e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 7168.4921875, + "completions/mean_terminated_length": 6635.36328125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8433891162276268, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 252020906.0, + "reward": 0.5546875, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.0003851866349577904, + "sampling/sampling_logp_difference/max": 7.861782550811768, + "sampling/sampling_logp_difference/mean": 0.01929781585931778, + "step": 291 + }, + { + "clip_ratio/high_max": 1.996871560550062e-05, + "clip_ratio/high_mean": 6.089093403716106e-06, + "clip_ratio/low_mean": 4.2792244585143635e-05, + "clip_ratio/low_min": 1.0337215371691855e-05, + "clip_ratio/region_mean": 4.8881338216233416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7322.5078125, + "completions/mean_terminated_length": 6876.8603515625, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 
0.9157031401991844, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036942458245903254, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 252977435.0, + "reward": 0.3359375, + "reward_std": 0.24275577068328857, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.00029605376766994596, + "sampling/sampling_logp_difference/max": 8.124969482421875, + "sampling/sampling_logp_difference/mean": 0.0205365102738142, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.631919460327481e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.631919460327481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16078.0, + "completions/mean_length": 7025.484375, + "completions/mean_terminated_length": 6723.5966796875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 1.1329731941223145, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034127074759453535, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 253896161.0, + "reward": 0.25, + "reward_std": 0.27722424268722534, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0005197672289796174, + "sampling/sampling_logp_difference/max": 7.562129497528076, + "sampling/sampling_logp_difference/mean": 0.023741140961647034, + "step": 293 + }, + { + "clip_ratio/high_max": 4.368643658381188e-06, + "clip_ratio/high_mean": 1.092160914595297e-06, + "clip_ratio/low_mean": 
2.4661783299961826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5753944555617636e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13776.0, + "completions/mean_length": 5996.1796875, + "completions/mean_terminated_length": 5661.08837890625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8773328885436058, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003959407564252615, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 254690264.0, + "reward": 0.53125, + "reward_std": 0.26645541191101074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07, + "sampling/sampling_logp_difference/max": 15.73043155670166, + "sampling/sampling_logp_difference/mean": 0.018407585099339485, + "step": 294 + }, + { + "clip_ratio/high_max": 1.616483677935321e-05, + "clip_ratio/high_mean": 4.041209194838302e-06, + "clip_ratio/low_mean": 3.736187466074625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140308453770558e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7165.328125, + "completions/mean_terminated_length": 6867.951171875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.9502597972750664, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030910037457942963, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 255626394.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 
0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000731945037842, + "sampling/importance_sampling_ratio/min": 0.00022311302018351853, + "sampling/sampling_logp_difference/max": 8.407832145690918, + "sampling/sampling_logp_difference/mean": 0.020668907091021538, + "step": 295 + }, + { + "clip_ratio/high_max": 1.1702686606440693e-05, + "clip_ratio/high_mean": 2.9256716516101733e-06, + "clip_ratio/low_mean": 5.5247357522603124e-05, + "clip_ratio/low_min": 3.6811261452385224e-06, + "clip_ratio/region_mean": 5.8173028264718596e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15375.0, + "completions/mean_length": 8001.9296875, + "completions/mean_terminated_length": 7661.34912109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8591345250606537, + "epoch": 0.27230910763569455, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037233952898532152, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 256673457.0, + "reward": 0.421875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.0021876997780054808, + "sampling/sampling_logp_difference/max": 6.124904632568359, + "sampling/sampling_logp_difference/mean": 0.020540472120046616, + "step": 296 + }, + { + "clip_ratio/high_max": 3.721341136042611e-05, + "clip_ratio/high_mean": 1.2759249216287571e-05, + "clip_ratio/low_mean": 3.570647322703735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.846572301175911e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 
6924.84375, + "completions/mean_terminated_length": 6697.82421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.7969356626272202, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006054217461496592, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 257578501.0, + "reward": 0.5078125, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.007889713160693645, + "sampling/sampling_logp_difference/max": 4.842195510864258, + "sampling/sampling_logp_difference/mean": 0.019306108355522156, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0211543894911301e-05, + "clip_ratio/high_mean": 2.5528859737278253e-06, + "clip_ratio/low_mean": 5.2388056587915344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4940942732173426e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14439.0, + "completions/mean_length": 6203.03125, + "completions/mean_terminated_length": 5958.6884765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.8734413683414459, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004903806839138269, + "learning_rate": 1e-05, + "loss": 0.0689, + "num_tokens": 258392625.0, + "reward": 0.4453125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 0.00020370795391499996, + "sampling/sampling_logp_difference/max": 8.498823165893555, + 
"sampling/sampling_logp_difference/mean": 0.01909301057457924, + "step": 298 + }, + { + "clip_ratio/high_max": 1.5135058674786706e-05, + "clip_ratio/high_mean": 4.64845766146027e-06, + "clip_ratio/low_mean": 4.373456977191381e-05, + "clip_ratio/low_min": 3.670856358439778e-06, + "clip_ratio/region_mean": 4.8383026296505705e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 7982.5390625, + "completions/mean_terminated_length": 7641.01611328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0091779381036758, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033637424930930138, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 259435270.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999765753746033, + "sampling/importance_sampling_ratio/min": 0.0016514655435457826, + "sampling/sampling_logp_difference/max": 6.406092166900635, + "sampling/sampling_logp_difference/mean": 0.02182736061513424, + "step": 299 + }, + { + "clip_ratio/high_max": 2.3964702677403693e-05, + "clip_ratio/high_mean": 5.991175669350923e-06, + "clip_ratio/low_mean": 5.2442986770984135e-05, + "clip_ratio/low_min": 8.75736759553547e-06, + "clip_ratio/region_mean": 5.843416238349164e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6915.3125, + "completions/mean_terminated_length": 6688.064453125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.7964543774724007, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.0052203768864274025, + "learning_rate": 1e-05, + "loss": 0.144, + "num_tokens": 260337614.0, + "reward": 0.46875, + "reward_std": 0.37928223609924316, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 7.032832218101248e-05, + "sampling/sampling_logp_difference/max": 9.562335968017578, + "sampling/sampling_logp_difference/mean": 0.017896221950650215, + "step": 300 + }, + { + "clip_ratio/high_max": 4.458271632756805e-05, + "clip_ratio/high_mean": 1.1145679081892013e-05, + "clip_ratio/low_mean": 6.243192206056847e-05, + "clip_ratio/low_min": 1.2397775662975619e-05, + "clip_ratio/region_mean": 7.357759886872373e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7029.4375, + "completions/mean_terminated_length": 6880.95263671875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.8605096861720085, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005570738110691309, + "learning_rate": 1e-05, + "loss": 0.0984, + "num_tokens": 261254070.0, + "reward": 0.4765625, + "reward_std": 0.3327290117740631, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999494552612305, + "sampling/importance_sampling_ratio/min": 0.0009070249507203698, + "sampling/sampling_logp_difference/max": 7.005340576171875, + "sampling/sampling_logp_difference/mean": 0.01905740052461624, + "step": 301 + }, + { + "clip_ratio/high_max": 3.390461233720998e-05, + "clip_ratio/high_mean": 1.1191766247975465e-05, + "clip_ratio/low_mean": 7.46641262594494e-05, + "clip_ratio/low_min": 
5.041745680500753e-06, + "clip_ratio/region_mean": 8.585589102949598e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5858.84375, + "completions/mean_terminated_length": 5606.240234375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8430554121732712, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004496110137552023, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 262024906.0, + "reward": 0.4453125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294877052307, + "sampling/importance_sampling_ratio/min": 0.00040469475788995624, + "sampling/sampling_logp_difference/max": 7.812377452850342, + "sampling/sampling_logp_difference/mean": 0.019225869327783585, + "step": 302 + }, + { + "clip_ratio/high_max": 3.2563955301156966e-06, + "clip_ratio/high_mean": 8.140988825289242e-07, + "clip_ratio/low_mean": 3.7080020149460324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.789411886145899e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15976.0, + "completions/mean_length": 8337.328125, + "completions/mean_terminated_length": 7728.7568359375, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.901745393872261, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00348713924176991, + "learning_rate": 1e-05, + "loss": -0.0002, + "num_tokens": 263110844.0, + "reward": 0.296875, + "reward_std": 0.20805485546588898, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.0022652465850114822, + "sampling/sampling_logp_difference/max": 6.090071678161621, + "sampling/sampling_logp_difference/mean": 0.02157524600625038, + "step": 303 + }, + { + "clip_ratio/high_max": 2.3739744847262045e-05, + "clip_ratio/high_mean": 5.934936211815511e-06, + "clip_ratio/low_mean": 2.823553325015382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.417046866616147e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7084.7265625, + "completions/mean_terminated_length": 6381.42041015625, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8265534415841103, + "epoch": 0.2796688132474701, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003980033565312624, + "learning_rate": 1e-05, + "loss": 0.0551, + "num_tokens": 264036169.0, + "reward": 0.3984375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673366546631, + "sampling/importance_sampling_ratio/min": 0.00012345099821686745, + "sampling/sampling_logp_difference/max": 8.999666213989258, + "sampling/sampling_logp_difference/mean": 0.018782664090394974, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1745505617000163e-05, + "clip_ratio/high_mean": 3.771558226617344e-06, + "clip_ratio/low_mean": 6.913120819262986e-05, + "clip_ratio/low_min": 2.494283216947224e-05, + "clip_ratio/region_mean": 7.290276607818669e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6543.796875, + 
"completions/mean_terminated_length": 6543.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8899869695305824, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.006467343773692846, + "learning_rate": 1e-05, + "loss": 0.1139, + "num_tokens": 264892767.0, + "reward": 0.484375, + "reward_std": 0.3934885561466217, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000489950180054, + "sampling/importance_sampling_ratio/min": 9.891482477542013e-05, + "sampling/sampling_logp_difference/max": 9.221251487731934, + "sampling/sampling_logp_difference/mean": 0.02032080665230751, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.395576979732141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.395576979732141e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16307.0, + "completions/mean_length": 8483.390625, + "completions/mean_terminated_length": 7813.84765625, + "completions/min_length": 1342.0, + "completions/min_terminated_length": 1342.0, + "entropy": 0.9621479511260986, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003174177836626768, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 265995697.0, + "reward": 0.3359375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.0005628522485494614, + "sampling/sampling_logp_difference/max": 7.4824934005737305, + "sampling/sampling_logp_difference/mean": 0.02145479805767536, + 
"step": 306 + }, + { + "clip_ratio/high_max": 1.2596524811669951e-05, + "clip_ratio/high_mean": 3.149131202917488e-06, + "clip_ratio/low_mean": 3.7911659774181317e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.106079018129094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14985.0, + "completions/mean_length": 7184.578125, + "completions/mean_terminated_length": 6963.79248046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9993807673454285, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003356153378263116, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 266937707.0, + "reward": 0.3828125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.0017036627978086472, + "sampling/sampling_logp_difference/max": 6.374974727630615, + "sampling/sampling_logp_difference/mean": 0.02204768732190132, + "step": 307 + }, + { + "clip_ratio/high_max": 1.9245163684900035e-05, + "clip_ratio/high_mean": 4.811290921225009e-06, + "clip_ratio/low_mean": 4.8845648166206956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.365693925796222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16216.0, + "completions/mean_length": 7029.2265625, + "completions/mean_terminated_length": 6727.45947265625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.9139953926205635, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006375293247401714, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 267853880.0, + 
"reward": 0.4765625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.010649868287146091, + "sampling/sampling_logp_difference/max": 4.542207717895508, + "sampling/sampling_logp_difference/mean": 0.020365029573440552, + "step": 308 + }, + { + "clip_ratio/high_max": 4.812504812434781e-06, + "clip_ratio/high_mean": 1.2031262031086953e-06, + "clip_ratio/low_mean": 2.5999243803198624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.720237000630732e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6188.0078125, + "completions/mean_terminated_length": 5943.30419921875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.7640773430466652, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003697809297591448, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 268665721.0, + "reward": 0.5078125, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372363090515, + "sampling/importance_sampling_ratio/min": 0.02927250787615776, + "sampling/sampling_logp_difference/max": 3.531106472015381, + "sampling/sampling_logp_difference/mean": 0.016581017524003983, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1358927824621787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1358927824621787e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16264.0, + "completions/mean_length": 8128.21875, + "completions/mean_terminated_length": 7861.90283203125, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.8218234181404114, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002286596456542611, + "learning_rate": 1e-05, + "loss": 0.0763, + "num_tokens": 269726181.0, + "reward": 0.375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798536300659, + "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06, + "sampling/sampling_logp_difference/max": 12.90043830871582, + "sampling/sampling_logp_difference/mean": 0.019403984770178795, + "step": 310 + }, + { + "clip_ratio/high_max": 1.4808477317274082e-05, + "clip_ratio/high_mean": 3.7021193293185206e-06, + "clip_ratio/low_mean": 3.0363167581981543e-05, + "clip_ratio/low_min": 6.364238288369961e-06, + "clip_ratio/region_mean": 3.4065286854456645e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 5673.3359375, + "completions/mean_terminated_length": 5503.32568359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.9275510385632515, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00485506234690547, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 270470616.0, + "reward": 0.4921875, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 
0.0009123464697040617, + "sampling/sampling_logp_difference/max": 6.999490737915039, + "sampling/sampling_logp_difference/mean": 0.01881871558725834, + "step": 311 + }, + { + "clip_ratio/high_max": 1.1274602456978755e-05, + "clip_ratio/high_mean": 3.6739949109687586e-06, + "clip_ratio/low_mean": 3.968570712231667e-05, + "clip_ratio/low_min": 3.4213767321489286e-06, + "clip_ratio/region_mean": 4.335970191959859e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 6944.8984375, + "completions/mean_terminated_length": 6795.07177734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9335741624236107, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005874342750757933, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 271377723.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000594854354858, + "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05, + "sampling/sampling_logp_difference/max": 10.049861907958984, + "sampling/sampling_logp_difference/mean": 0.020590776577591896, + "step": 312 + }, + { + "clip_ratio/high_max": 1.264126694877632e-05, + "clip_ratio/high_mean": 3.16031673719408e-06, + "clip_ratio/low_mean": 3.206376845810155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.522408474054828e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7705.625, + "completions/mean_terminated_length": 7278.8193359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.8491624072194099, + "epoch": 
0.28794848206071755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001684082904830575, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 272384891.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 6.605865200981498e-05, + "sampling/sampling_logp_difference/max": 9.624967575073242, + "sampling/sampling_logp_difference/mean": 0.020136822015047073, + "step": 313 + }, + { + "clip_ratio/high_max": 9.772357770998497e-06, + "clip_ratio/high_mean": 2.443089442749624e-06, + "clip_ratio/low_mean": 3.8573590472879005e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101667946088128e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6611.1484375, + "completions/mean_terminated_length": 6534.19677734375, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "entropy": 0.8867302760481834, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003692191792652011, + "learning_rate": 1e-05, + "loss": 0.1233, + "num_tokens": 273251630.0, + "reward": 0.3984375, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.0031062732450664043, + "sampling/sampling_logp_difference/max": 5.774331569671631, + "sampling/sampling_logp_difference/mean": 0.019237037748098373, + "step": 314 + }, + { + "clip_ratio/high_max": 3.0103737344688852e-05, + "clip_ratio/high_mean": 9.664363972206047e-06, + 
"clip_ratio/low_mean": 1.7575501146893657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.723986426644842e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15786.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 6770.46875, + "completions/mean_terminated_length": 6770.46875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.8252957463264465, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004167635925114155, + "learning_rate": 1e-05, + "loss": -0.0072, + "num_tokens": 274146482.0, + "reward": 0.5703125, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + "sampling/importance_sampling_ratio/min": 0.00010247006866848096, + "sampling/sampling_logp_difference/max": 9.18593978881836, + "sampling/sampling_logp_difference/mean": 0.019684650003910065, + "step": 315 + }, + { + "clip_ratio/high_max": 6.529460733872838e-06, + "clip_ratio/high_mean": 1.6323651834682096e-06, + "clip_ratio/low_mean": 3.877351048231503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.040587566578324e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15827.0, + "completions/mean_length": 8210.859375, + "completions/mean_terminated_length": 7365.36181640625, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.8118235394358635, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030363225378096104, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 275214040.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 
0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998943209648132, + "sampling/importance_sampling_ratio/min": 0.002854935359209776, + "sampling/sampling_logp_difference/max": 5.858705997467041, + "sampling/sampling_logp_difference/mean": 0.019275270402431488, + "step": 316 + }, + { + "clip_ratio/high_max": 7.0800629146106075e-06, + "clip_ratio/high_mean": 1.7700157286526519e-06, + "clip_ratio/low_mean": 2.3981688286767167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5751703674359305e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14900.0, + "completions/mean_length": 7072.8828125, + "completions/mean_terminated_length": 6849.41650390625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.8018335327506065, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004777858033776283, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 276138049.0, + "reward": 0.453125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 0.0028502768836915493, + "sampling/sampling_logp_difference/max": 5.860339164733887, + "sampling/sampling_logp_difference/mean": 0.01849908009171486, + "step": 317 + }, + { + "clip_ratio/high_max": 2.259368602608447e-05, + "clip_ratio/high_mean": 5.648421506521117e-06, + "clip_ratio/low_mean": 4.28424866640853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.849090737479855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14447.0, + "completions/mean_length": 5889.8359375, + 
"completions/mean_terminated_length": 5723.26220703125, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.7976400703191757, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030593445990234613, + "learning_rate": 1e-05, + "loss": 0.1331, + "num_tokens": 276910124.0, + "reward": 0.5859375, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.000139843366923742, + "sampling/sampling_logp_difference/max": 8.874987602233887, + "sampling/sampling_logp_difference/mean": 0.01834402233362198, + "step": 318 + }, + { + "clip_ratio/high_max": 1.4654247024736833e-05, + "clip_ratio/high_mean": 3.663561756184208e-06, + "clip_ratio/low_mean": 2.377464920755301e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7438210736363544e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 7144.265625, + "completions/mean_terminated_length": 6689.85205078125, + "completions/min_length": 1200.0, + "completions/min_terminated_length": 1200.0, + "entropy": 0.8309404999017715, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004245694726705551, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 277843542.0, + "reward": 0.4453125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998534321784973, + "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05, + "sampling/sampling_logp_difference/max": 11.499897956848145, + 
"sampling/sampling_logp_difference/mean": 0.01875344291329384, + "step": 319 + }, + { + "clip_ratio/high_max": 6.252500952541595e-06, + "clip_ratio/high_mean": 2.241558604509919e-06, + "clip_ratio/low_mean": 4.735765514851664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9599213525652885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15722.0, + "completions/mean_length": 6779.5234375, + "completions/mean_terminated_length": 6703.8974609375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.9584890529513359, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035574575886130333, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 278730129.0, + "reward": 0.3984375, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.005792221520096064, + "sampling/sampling_logp_difference/max": 5.151239395141602, + "sampling/sampling_logp_difference/mean": 0.02137477695941925, + "step": 320 + }, + { + "clip_ratio/high_max": 3.2948471016425174e-05, + "clip_ratio/high_mean": 9.518853403278627e-06, + "clip_ratio/low_mean": 2.195712454522436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.14759782895635e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15892.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 5582.9765625, + "completions/mean_terminated_length": 5582.9765625, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8629376217722893, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037982752546668053, + "learning_rate": 
1e-05, + "loss": 0.0331, + "num_tokens": 279462542.0, + "reward": 0.5546875, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999780058860779, + "sampling/importance_sampling_ratio/min": 0.0021874974481761456, + "sampling/sampling_logp_difference/max": 6.124997138977051, + "sampling/sampling_logp_difference/mean": 0.01906203106045723, + "step": 321 + }, + { + "clip_ratio/high_max": 1.1029473625967512e-05, + "clip_ratio/high_mean": 2.757368406491878e-06, + "clip_ratio/low_mean": 5.367386921761863e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6431237737797346e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 6942.2578125, + "completions/mean_terminated_length": 6477.90966796875, + "completions/min_length": 1156.0, + "completions/min_terminated_length": 1156.0, + "entropy": 0.8147861957550049, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027678858023136854, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 280370207.0, + "reward": 0.4375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998471736907959, + "sampling/importance_sampling_ratio/min": 0.00023058800434228033, + "sampling/sampling_logp_difference/max": 8.3748779296875, + "sampling/sampling_logp_difference/mean": 0.01940828748047352, + "step": 322 + }, + { + "clip_ratio/high_max": 2.6367894406575942e-05, + "clip_ratio/high_mean": 8.765707434577052e-06, + "clip_ratio/low_mean": 3.232976985145797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.109547796815605e-05, + 
"completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6242.53125, + "completions/mean_terminated_length": 5915.38671875, + "completions/min_length": 1220.0, + "completions/min_terminated_length": 1220.0, + "entropy": 0.878915011882782, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00577945914119482, + "learning_rate": 1e-05, + "loss": 0.0839, + "num_tokens": 281189491.0, + "reward": 0.515625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 9.611724817659706e-05, + "sampling/sampling_logp_difference/max": 9.2499418258667, + "sampling/sampling_logp_difference/mean": 0.01948760263621807, + "step": 323 + }, + { + "clip_ratio/high_max": 3.50839609382092e-05, + "clip_ratio/high_mean": 1.1664920634757436e-05, + "clip_ratio/low_mean": 1.833109013205103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9996010880495305e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 7004.015625, + "completions/mean_terminated_length": 6622.71533203125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.7964659407734871, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014128695474937558, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 282103997.0, + "reward": 0.4140625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.0024504722096025944, + "sampling/sampling_logp_difference/max": 6.011474609375, + "sampling/sampling_logp_difference/mean": 0.019019678235054016, + "step": 324 + }, + { + "clip_ratio/high_max": 1.832260545597819e-05, + "clip_ratio/high_mean": 4.580651363994548e-06, + "clip_ratio/low_mean": 5.309064226821647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.767129368905444e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7822.6953125, + "completions/mean_terminated_length": 7546.52392578125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.8571138679981232, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002476039342582226, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 283122382.0, + "reward": 0.4609375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.0009774373611435294, + "sampling/sampling_logp_difference/max": 6.930576324462891, + "sampling/sampling_logp_difference/mean": 0.020557202398777008, + "step": 325 + }, + { + "clip_ratio/high_max": 5.738419986300869e-06, + "clip_ratio/high_mean": 1.4346049965752172e-06, + "clip_ratio/low_mean": 4.19679121819172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3402517292179255e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7738.8984375, + "completions/mean_terminated_length": 6844.57763671875, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + 
"entropy": 0.7839021533727646, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005309853237122297, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 284130081.0, + "reward": 0.5234375, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998971223831177, + "sampling/importance_sampling_ratio/min": 0.0001319014554610476, + "sampling/sampling_logp_difference/max": 8.933455467224121, + "sampling/sampling_logp_difference/mean": 0.01873316988348961, + "step": 326 + }, + { + "clip_ratio/high_max": 1.007085802484653e-05, + "clip_ratio/high_mean": 2.5177145062116324e-06, + "clip_ratio/low_mean": 4.043528815600439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.295300277590286e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15952.0, + "completions/mean_length": 7102.2421875, + "completions/mean_terminated_length": 6954.9130859375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8530801385641098, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228116944432259, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 285058720.0, + "reward": 0.5078125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00012956927821505815, + "sampling/sampling_logp_difference/max": 8.951294898986816, + "sampling/sampling_logp_difference/mean": 0.019325006753206253, + "step": 327 + }, + { + "clip_ratio/high_max": 4.06874551117653e-06, + "clip_ratio/high_mean": 
1.0171863777941326e-06, + "clip_ratio/low_mean": 3.661125703047219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.762844340826632e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15594.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 6583.4765625, + "completions/mean_terminated_length": 6583.4765625, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.021921381354332, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004967439454048872, + "learning_rate": 1e-05, + "loss": 0.0374, + "num_tokens": 285919765.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, + "sampling/importance_sampling_ratio/min": 0.016675354912877083, + "sampling/sampling_logp_difference/max": 4.093823432922363, + "sampling/sampling_logp_difference/mean": 0.021393200382590294, + "step": 328 + }, + { + "clip_ratio/high_max": 1.2215251445013564e-05, + "clip_ratio/high_mean": 3.053812861253391e-06, + "clip_ratio/low_mean": 4.05305947879242e-05, + "clip_ratio/low_min": 4.215567059873138e-06, + "clip_ratio/region_mean": 4.358440742180392e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16299.0, + "completions/mean_length": 7770.5859375, + "completions/mean_terminated_length": 7346.97509765625, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "entropy": 1.0466903448104858, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004189736675471067, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 286935512.0, + "reward": 0.3828125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 
0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.011683559976518154, + "sampling/sampling_logp_difference/max": 4.449572563171387, + "sampling/sampling_logp_difference/mean": 0.021805983036756516, + "step": 329 + }, + { + "clip_ratio/high_max": 2.0567378214764176e-05, + "clip_ratio/high_mean": 5.141844553691044e-06, + "clip_ratio/low_mean": 1.8177100628236076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3318944840866607e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15758.0, + "completions/mean_length": 5689.2421875, + "completions/mean_terminated_length": 5432.568359375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 1194.0, + "entropy": 0.7778806164860725, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032866497058421373, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 287681943.0, + "reward": 0.640625, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940812587738, + "sampling/importance_sampling_ratio/min": 0.00038077132194302976, + "sampling/sampling_logp_difference/max": 7.873311519622803, + "sampling/sampling_logp_difference/mean": 0.01789461076259613, + "step": 330 + }, + { + "clip_ratio/high_max": 3.109086901531555e-05, + "clip_ratio/high_mean": 7.772717253828887e-06, + "clip_ratio/low_mean": 3.1423560130861006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919627738468989e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13820.0, + 
"completions/mean_length": 6288.1875, + "completions/mean_terminated_length": 6127.93701171875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.7709921672940254, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023572889622300863, + "learning_rate": 1e-05, + "loss": 0.0746, + "num_tokens": 288506735.0, + "reward": 0.484375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 0.000430915504693985, + "sampling/sampling_logp_difference/max": 7.749598503112793, + "sampling/sampling_logp_difference/mean": 0.017407266423106194, + "step": 331 + }, + { + "clip_ratio/high_max": 3.4638953366084024e-05, + "clip_ratio/high_mean": 9.51674803673086e-06, + "clip_ratio/low_mean": 6.26047980176736e-05, + "clip_ratio/low_min": 5.51267930859467e-06, + "clip_ratio/region_mean": 7.212154741864651e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 6775.0234375, + "completions/mean_terminated_length": 6465.05615234375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9338318258523941, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034220058005303144, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 289395498.0, + "reward": 0.390625, + "reward_std": 0.34533774852752686, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.0317598432302475, + 
"sampling/sampling_logp_difference/max": 3.449552536010742, + "sampling/sampling_logp_difference/mean": 0.019930530339479446, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.159989991123439e-05, + "clip_ratio/low_min": 1.5592839645250933e-05, + "clip_ratio/region_mean": 7.159989991123439e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 7142.9375, + "completions/mean_terminated_length": 6844.83837890625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.971405878663063, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002513247774913907, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 290329082.0, + "reward": 0.328125, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 3.152207455059397e-07, + "sampling/sampling_logp_difference/max": 14.969992637634277, + "sampling/sampling_logp_difference/mean": 0.022366533055901527, + "step": 333 + }, + { + "clip_ratio/high_max": 1.6507752206962323e-05, + "clip_ratio/high_mean": 4.126938051740581e-06, + "clip_ratio/low_mean": 1.7493430505055585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1620368215735652e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15581.0, + "completions/mean_length": 6412.2109375, + "completions/mean_terminated_length": 6333.69287109375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "entropy": 0.9136044681072235, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.625, + 
"grad_norm": 0.0056767817586660385, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 291170133.0, + "reward": 0.421875, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999720454216003, + "sampling/importance_sampling_ratio/min": 0.000458698661532253, + "sampling/sampling_logp_difference/max": 7.687117099761963, + "sampling/sampling_logp_difference/mean": 0.020012658089399338, + "step": 334 + }, + { + "clip_ratio/high_max": 8.26085442895419e-06, + "clip_ratio/high_mean": 2.0652136072385474e-06, + "clip_ratio/low_mean": 3.6938338666914206e-05, + "clip_ratio/low_min": 5.699044777429663e-06, + "clip_ratio/region_mean": 3.900355193309224e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16111.0, + "completions/mean_length": 8066.1015625, + "completions/mean_terminated_length": 7797.7822265625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 1.0789504647254944, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00243841833434999, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 292222082.0, + "reward": 0.3046875, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999664425849915, + "sampling/importance_sampling_ratio/min": 8.481895929435268e-05, + "sampling/sampling_logp_difference/max": 9.374991416931152, + "sampling/sampling_logp_difference/mean": 0.023650091141462326, + "step": 335 + }, + { + "clip_ratio/high_max": 5.320054697222076e-06, + "clip_ratio/high_mean": 1.330013674305519e-06, + "clip_ratio/low_mean": 1.9117383317279746e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0447396991585265e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15176.0, + "completions/mean_length": 6836.046875, + "completions/mean_terminated_length": 6606.896484375, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 1.218759760260582, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020856577903032303, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 293115984.0, + "reward": 0.21875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 2.784526441246271e-05, + "sampling/sampling_logp_difference/max": 10.488847732543945, + "sampling/sampling_logp_difference/mean": 0.022012067958712578, + "step": 336 + }, + { + "clip_ratio/high_max": 2.5695502699818462e-05, + "clip_ratio/high_mean": 7.549717793153832e-06, + "clip_ratio/low_mean": 4.6741323160404136e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.429104089671455e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7501.9921875, + "completions/mean_terminated_length": 7140.9345703125, + "completions/min_length": 1237.0, + "completions/min_terminated_length": 1237.0, + "entropy": 0.8940394818782806, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005163854919373989, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 294099503.0, + "reward": 0.328125, + "reward_std": 0.30904707312583923, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999276399612427, + "sampling/importance_sampling_ratio/min": 0.0006545600481331348, + "sampling/sampling_logp_difference/max": 7.331547260284424, + "sampling/sampling_logp_difference/mean": 0.020813245326280594, + "step": 337 + }, + { + "clip_ratio/high_max": 3.1606674838258186e-05, + "clip_ratio/high_mean": 9.45794374729303e-06, + "clip_ratio/low_mean": 4.5567895540443715e-05, + "clip_ratio/low_min": 4.458871444512624e-06, + "clip_ratio/region_mean": 5.502583962879726e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7204.828125, + "completions/mean_terminated_length": 6908.7255859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.9961872175335884, + "epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029277894645929337, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 295042105.0, + "reward": 0.390625, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000677108764648, + "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05, + "sampling/sampling_logp_difference/max": 10.872637748718262, + "sampling/sampling_logp_difference/mean": 0.020187582820653915, + "step": 338 + }, + { + "clip_ratio/high_max": 1.7963964182854397e-05, + "clip_ratio/high_mean": 5.194059781388205e-06, + "clip_ratio/low_mean": 1.8380221035840805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.357428081722901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15856.0, + "completions/mean_length": 6256.859375, + 
"completions/mean_terminated_length": 6013.80810546875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "entropy": 0.9293600022792816, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032952844630926847, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 295867039.0, + "reward": 0.46875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999649524688721, + "sampling/importance_sampling_ratio/min": 7.995560008566827e-05, + "sampling/sampling_logp_difference/max": 9.434039115905762, + "sampling/sampling_logp_difference/mean": 0.019491540268063545, + "step": 339 + }, + { + "clip_ratio/high_max": 7.577551059512189e-06, + "clip_ratio/high_mean": 1.8943877648780472e-06, + "clip_ratio/low_mean": 2.7479814093567256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9374201631071628e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15412.0, + "completions/mean_length": 7397.84375, + "completions/mean_terminated_length": 7032.552734375, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.8508890569210052, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029417150653898716, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 296832843.0, + "reward": 0.375, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000183582305908, + "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05, + "sampling/sampling_logp_difference/max": 10.93724250793457, + 
"sampling/sampling_logp_difference/mean": 0.01975393109023571, + "step": 340 + }, + { + "clip_ratio/high_max": 3.281225508544594e-05, + "clip_ratio/high_mean": 1.3302957199812226e-05, + "clip_ratio/low_mean": 5.109179869577929e-05, + "clip_ratio/low_min": 6.657612175331451e-06, + "clip_ratio/region_mean": 6.439475532715733e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6897.765625, + "completions/mean_terminated_length": 6823.07080078125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9046694040298462, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026788609102368355, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 297735285.0, + "reward": 0.421875, + "reward_std": 0.3266732692718506, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.001710799871943891, + "sampling/sampling_logp_difference/max": 6.370794296264648, + "sampling/sampling_logp_difference/mean": 0.020578179508447647, + "step": 341 + }, + { + "clip_ratio/high_max": 1.7319889593636617e-05, + "clip_ratio/high_mean": 5.168538336874917e-06, + "clip_ratio/low_mean": 7.019768918326008e-05, + "clip_ratio/low_min": 2.541147478041239e-05, + "clip_ratio/region_mean": 7.53662266106403e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 6971.9921875, + "completions/mean_terminated_length": 6509.10595703125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8658201694488525, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.125, + "grad_norm": 
0.005915141198784113, + "learning_rate": 1e-05, + "loss": 0.0923, + "num_tokens": 298645124.0, + "reward": 0.3984375, + "reward_std": 0.3742823898792267, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999268651008606, + "sampling/importance_sampling_ratio/min": 0.000970841443631798, + "sampling/sampling_logp_difference/max": 6.937347412109375, + "sampling/sampling_logp_difference/mean": 0.01906151883304119, + "step": 342 + }, + { + "clip_ratio/high_max": 1.8332865238335216e-05, + "clip_ratio/high_mean": 4.583216309583804e-06, + "clip_ratio/low_mean": 6.167940273371642e-05, + "clip_ratio/low_min": 5.969151516183047e-06, + "clip_ratio/region_mean": 6.626261847486603e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15054.0, + "completions/mean_length": 6545.6953125, + "completions/mean_terminated_length": 5889.80859375, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.779609851539135, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032792428974062204, + "learning_rate": 1e-05, + "loss": 0.097, + "num_tokens": 299503781.0, + "reward": 0.609375, + "reward_std": 0.38293448090553284, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999361634254456, + "sampling/importance_sampling_ratio/min": 0.002187495119869709, + "sampling/sampling_logp_difference/max": 6.124998092651367, + "sampling/sampling_logp_difference/mean": 0.017413027584552765, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.46246323235755e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.46246323235755e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7226.515625, + "completions/mean_terminated_length": 7006.736328125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9573849961161613, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005092279519885778, + "learning_rate": 1e-05, + "loss": 0.1102, + "num_tokens": 300447903.0, + "reward": 0.5390625, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999373555183411, + "sampling/importance_sampling_ratio/min": 0.000627054600045085, + "sampling/sampling_logp_difference/max": 7.374476909637451, + "sampling/sampling_logp_difference/mean": 0.021570835262537003, + "step": 344 + }, + { + "clip_ratio/high_max": 5.487269390869187e-06, + "clip_ratio/high_mean": 1.3718173477172968e-06, + "clip_ratio/low_mean": 4.7280102080549113e-05, + "clip_ratio/low_min": 1.0166083029616857e-05, + "clip_ratio/region_mean": 4.865191931457957e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14967.0, + "completions/mean_length": 5755.171875, + "completions/mean_terminated_length": 5323.10546875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8482184633612633, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005033228080719709, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 301206021.0, + "reward": 0.390625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.0014573346124961972, + "sampling/sampling_logp_difference/max": 6.531146049499512, + "sampling/sampling_logp_difference/mean": 0.018870476633310318, + "step": 345 + }, + { + "clip_ratio/high_max": 5.421346941147931e-06, + "clip_ratio/high_mean": 1.3553367352869827e-06, + "clip_ratio/low_mean": 1.6510994441887306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.786633117717429e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 7098.7265625, + "completions/mean_terminated_length": 6875.88037109375, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "entropy": 0.87320177257061, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007659573573619127, + "learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 302133890.0, + "reward": 0.421875, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0012466582702472806, + "sampling/sampling_logp_difference/max": 6.687288761138916, + "sampling/sampling_logp_difference/mean": 0.019994346424937248, + "step": 346 + }, + { + "clip_ratio/high_max": 1.1556229310372146e-05, + "clip_ratio/high_mean": 2.8890573275930365e-06, + "clip_ratio/low_mean": 3.8744643916288624e-05, + "clip_ratio/low_min": 6.108287834649673e-06, + "clip_ratio/region_mean": 4.1633702039689524e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16139.0, + "completions/mean_length": 6399.96875, + "completions/mean_terminated_length": 6077.90283203125, + 
"completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9481896534562111, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014135175151750445, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 302972566.0, + "reward": 0.4140625, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0025698256213217974, + "sampling/sampling_logp_difference/max": 5.963917255401611, + "sampling/sampling_logp_difference/mean": 0.02073008380830288, + "step": 347 + }, + { + "clip_ratio/high_max": 6.59491388432798e-06, + "clip_ratio/high_mean": 2.545892130001448e-06, + "clip_ratio/low_mean": 4.620846755187813e-05, + "clip_ratio/low_min": 6.243132702365983e-06, + "clip_ratio/region_mean": 4.875435956819274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 7298.078125, + "completions/mean_terminated_length": 7226.53564453125, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "entropy": 0.8719206526875496, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027898226398974657, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 303925976.0, + "reward": 0.484375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.005236432887613773, + "sampling/sampling_logp_difference/max": 5.252114772796631, + "sampling/sampling_logp_difference/mean": 
0.020944103598594666, + "step": 348 + }, + { + "clip_ratio/high_max": 1.052124343914329e-05, + "clip_ratio/high_mean": 2.6303108597858227e-06, + "clip_ratio/low_mean": 2.010384196182713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.273415248055244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14980.0, + "completions/mean_length": 5667.0390625, + "completions/mean_terminated_length": 5496.9287109375, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "entropy": 0.8791451379656792, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012764945859089494, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 304675157.0, + "reward": 0.390625, + "reward_std": 0.17965976893901825, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000383853912354, + "sampling/importance_sampling_ratio/min": 5.054428584116977e-06, + "sampling/sampling_logp_difference/max": 12.195245742797852, + "sampling/sampling_logp_difference/mean": 0.018928447738289833, + "step": 349 + }, + { + "clip_ratio/high_max": 9.578045592206763e-06, + "clip_ratio/high_mean": 2.3945113980516908e-06, + "clip_ratio/low_mean": 3.1114799753595435e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350931149270764e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15354.0, + "completions/max_terminated_length": 15354.0, + "completions/mean_length": 5874.4453125, + "completions/mean_terminated_length": 5874.4453125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9577538818120956, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00509974779561162, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 
305447038.0, + "reward": 0.515625, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999423027038574, + "sampling/importance_sampling_ratio/min": 0.004791648127138615, + "sampling/sampling_logp_difference/max": 5.340880870819092, + "sampling/sampling_logp_difference/mean": 0.02114470861852169, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0903062275247066e-05, + "clip_ratio/high_mean": 2.7257655688117666e-06, + "clip_ratio/low_mean": 4.784364205079328e-05, + "clip_ratio/low_min": 3.861600362142781e-06, + "clip_ratio/region_mean": 5.056940744907479e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 6197.5703125, + "completions/mean_terminated_length": 6035.88134765625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.8665244281291962, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030849494505673647, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 306258023.0, + "reward": 0.515625, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998056888580322, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.021017421036958694, + "step": 351 + }, + { + "clip_ratio/high_max": 1.4299712574938894e-05, + "clip_ratio/high_mean": 4.3520980170796975e-06, + "clip_ratio/low_mean": 6.213493452378316e-05, + "clip_ratio/low_min": 1.0056635801447555e-05, + "clip_ratio/region_mean": 6.648703174505499e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7522.578125, + "completions/mean_terminated_length": 7381.9208984375, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.8185881152749062, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002946985885500908, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 307240305.0, + "reward": 0.3125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.005127199459820986, + "sampling/sampling_logp_difference/max": 5.273195743560791, + "sampling/sampling_logp_difference/mean": 0.01965932548046112, + "step": 352 + }, + { + "clip_ratio/high_max": 1.693051035545068e-05, + "clip_ratio/high_mean": 5.08456730585749e-06, + "clip_ratio/low_mean": 4.2052345861520735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.713691282631771e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14090.0, + "completions/mean_length": 6403.2265625, + "completions/mean_terminated_length": 6163.6884765625, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "entropy": 0.8359840363264084, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031181599479168653, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 308079318.0, + "reward": 0.5, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999215602874756, + 
"sampling/importance_sampling_ratio/min": 6.73715621815063e-05, + "sampling/sampling_logp_difference/max": 9.605287551879883, + "sampling/sampling_logp_difference/mean": 0.01963040418922901, + "step": 353 + }, + { + "clip_ratio/high_max": 1.3988919135954347e-05, + "clip_ratio/high_mean": 3.497229783988587e-06, + "clip_ratio/low_mean": 6.722658486069122e-05, + "clip_ratio/low_min": 1.858519090092159e-05, + "clip_ratio/region_mean": 7.072381458783639e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7954.03125, + "completions/mean_terminated_length": 7751.71240234375, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.905990719795227, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002656223252415657, + "learning_rate": 1e-05, + "loss": 0.1022, + "num_tokens": 309117770.0, + "reward": 0.3828125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999536275863647, + "sampling/importance_sampling_ratio/min": 0.0003354826185386628, + "sampling/sampling_logp_difference/max": 7.999940395355225, + "sampling/sampling_logp_difference/mean": 0.020741507411003113, + "step": 354 + }, + { + "clip_ratio/high_max": 1.7610595023143105e-05, + "clip_ratio/high_mean": 4.402648755785776e-06, + "clip_ratio/low_mean": 4.337988764291367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.778253651238629e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6630.09375, + "completions/mean_terminated_length": 6315.45166015625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + 
"entropy": 0.870736837387085, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0060529084876179695, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 309988894.0, + "reward": 0.515625, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998822212219238, + "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05, + "sampling/sampling_logp_difference/max": 10.716434478759766, + "sampling/sampling_logp_difference/mean": 0.02060208097100258, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0448093235027045e-05, + "clip_ratio/high_mean": 2.6120233087567613e-06, + "clip_ratio/low_mean": 3.1030769946482906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.364279325523967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15920.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6679.6171875, + "completions/mean_terminated_length": 6679.6171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9812518879771233, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00400698184967041, + "learning_rate": 1e-05, + "loss": 0.0605, + "num_tokens": 310864013.0, + "reward": 0.421875, + "reward_std": 0.3295465111732483, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999049305915833, + "sampling/importance_sampling_ratio/min": 0.0020593837834894657, + "sampling/sampling_logp_difference/max": 6.1853485107421875, + "sampling/sampling_logp_difference/mean": 0.02098071575164795, + "step": 356 + }, + { + "clip_ratio/high_max": 2.124982574969181e-05, + "clip_ratio/high_mean": 
7.736592579021817e-06, + "clip_ratio/low_mean": 2.900951585615985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.674610888992902e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14541.0, + "completions/mean_length": 5523.796875, + "completions/mean_terminated_length": 5173.4677734375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9120645374059677, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005929585546255112, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 311589987.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998446702957153, + "sampling/importance_sampling_ratio/min": 0.0010661041596904397, + "sampling/sampling_logp_difference/max": 6.843744277954102, + "sampling/sampling_logp_difference/mean": 0.019948206841945648, + "step": 357 + }, + { + "clip_ratio/high_max": 2.4486997745043482e-05, + "clip_ratio/high_mean": 8.219769085826556e-06, + "clip_ratio/low_mean": 5.346400575945154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.168377467474784e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15401.0, + "completions/mean_length": 6361.3671875, + "completions/mean_terminated_length": 6282.44873046875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.8044678047299385, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006622390355914831, + "learning_rate": 1e-05, + "loss": 0.1023, + "num_tokens": 312424034.0, + "reward": 0.5078125, + "reward_std": 0.3724474310874939, + "rewards/accuracy_reward/mean": 
0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.0003157092141918838, + "sampling/sampling_logp_difference/max": 8.060688972473145, + "sampling/sampling_logp_difference/mean": 0.018907658755779266, + "step": 358 + }, + { + "clip_ratio/high_max": 1.0407376748844399e-05, + "clip_ratio/high_mean": 2.6018441872110998e-06, + "clip_ratio/low_mean": 5.925514369664597e-05, + "clip_ratio/low_min": 1.3324347946763737e-05, + "clip_ratio/region_mean": 6.185698703120579e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 7109.0, + "completions/mean_terminated_length": 7035.96826171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9167275875806808, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004639944992959499, + "learning_rate": 1e-05, + "loss": 0.0861, + "num_tokens": 313353346.0, + "reward": 0.4140625, + "reward_std": 0.3826971650123596, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999389052391052, + "sampling/importance_sampling_ratio/min": 0.0019070414127781987, + "sampling/sampling_logp_difference/max": 6.262202262878418, + "sampling/sampling_logp_difference/mean": 0.02155841514468193, + "step": 359 + }, + { + "clip_ratio/high_max": 3.959046694035351e-05, + "clip_ratio/high_mean": 1.0912523691786191e-05, + "clip_ratio/low_mean": 3.3944450819944905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.485697365907981e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + 
"completions/mean_length": 6314.2734375, + "completions/mean_terminated_length": 6072.60009765625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.8780038207769394, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007643720600754023, + "learning_rate": 1e-05, + "loss": 0.0873, + "num_tokens": 314180717.0, + "reward": 0.4609375, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999802112579346, + "sampling/importance_sampling_ratio/min": 0.021285315975546837, + "sampling/sampling_logp_difference/max": 3.8497378826141357, + "sampling/sampling_logp_difference/mean": 0.01964358240365982, + "step": 360 + }, + { + "clip_ratio/high_max": 3.065382111344661e-05, + "clip_ratio/high_mean": 9.187473835936544e-06, + "clip_ratio/low_mean": 4.137891801292426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.056639065514901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6718.2265625, + "completions/mean_terminated_length": 6486.24853515625, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8326799497008324, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050973957404494286, + "learning_rate": 1e-05, + "loss": 0.0109, + "num_tokens": 315060842.0, + "reward": 0.5078125, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.0009130688849836588, + "sampling/sampling_logp_difference/max": 
6.998699188232422, + "sampling/sampling_logp_difference/mean": 0.019501537084579468, + "step": 361 + }, + { + "clip_ratio/high_max": 8.624853762739804e-06, + "clip_ratio/high_mean": 2.156213440684951e-06, + "clip_ratio/low_mean": 1.8797969062234188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0954182048171788e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 8666.8359375, + "completions/mean_terminated_length": 7941.291015625, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.9526705741882324, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019092690199613571, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 316190325.0, + "reward": 0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999814629554749, + "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05, + "sampling/sampling_logp_difference/max": 10.249995231628418, + "sampling/sampling_logp_difference/mean": 0.02051631174981594, + "step": 362 + }, + { + "clip_ratio/high_max": 2.147400391550036e-05, + "clip_ratio/high_mean": 6.434908300434472e-06, + "clip_ratio/low_mean": 3.521234066283796e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.164724816746457e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15164.0, + "completions/mean_length": 7661.8203125, + "completions/mean_terminated_length": 7002.16015625, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.8322782590985298, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5625, + "grad_norm": 
0.0019530428107827902, + "learning_rate": 1e-05, + "loss": 0.0729, + "num_tokens": 317191878.0, + "reward": 0.4609375, + "reward_std": 0.21382391452789307, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 8.546619210392237e-05, + "sampling/sampling_logp_difference/max": 9.367389678955078, + "sampling/sampling_logp_difference/mean": 0.019894573837518692, + "step": 363 + }, + { + "clip_ratio/high_max": 1.9436202364886412e-05, + "clip_ratio/high_mean": 6.089704697842535e-06, + "clip_ratio/low_mean": 4.2698405422925134e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.878810955233348e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 7024.859375, + "completions/mean_terminated_length": 6800.240234375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.794853538274765, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031784537713974714, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 318109004.0, + "reward": 0.4921875, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352693557739, + "sampling/importance_sampling_ratio/min": 0.0002962362195830792, + "sampling/sampling_logp_difference/max": 8.124353408813477, + "sampling/sampling_logp_difference/mean": 0.018519200384616852, + "step": 364 + }, + { + "clip_ratio/high_max": 4.127455667912727e-06, + "clip_ratio/high_mean": 1.0318639169781818e-06, + "clip_ratio/low_mean": 4.342453667049995e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.445640047379129e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 7282.1796875, + "completions/mean_terminated_length": 6912.1865234375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.904067650437355, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005080109462141991, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 319059075.0, + "reward": 0.4140625, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062108039856, + "sampling/importance_sampling_ratio/min": 0.1194523349404335, + "sampling/sampling_logp_difference/max": 6.136754989624023, + "sampling/sampling_logp_difference/mean": 0.019978653639554977, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.608940076243016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.608940076243016e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15625.0, + "completions/mean_length": 7131.5234375, + "completions/mean_terminated_length": 6596.255859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.8849587142467499, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022667953744530678, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 319990046.0, + "reward": 0.46875, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0370909757912159, + "sampling/sampling_logp_difference/max": 3.294381618499756, + "sampling/sampling_logp_difference/mean": 0.02037571743130684, + "step": 366 + }, + { + "clip_ratio/high_max": 1.5356635913121863e-05, + "clip_ratio/high_mean": 3.839158978280466e-06, + "clip_ratio/low_mean": 3.4950805911648786e-05, + "clip_ratio/low_min": 4.876336333836662e-06, + "clip_ratio/region_mean": 3.8789965287833184e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 6655.4453125, + "completions/mean_terminated_length": 6578.84228515625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.7417122721672058, + "epoch": 0.3376264949402024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00216497085057199, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 320860135.0, + "reward": 0.5625, + "reward_std": 0.3369230031967163, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0005190494703128934, + "sampling/sampling_logp_difference/max": 7.563511371612549, + "sampling/sampling_logp_difference/mean": 0.01771342009305954, + "step": 367 + }, + { + "clip_ratio/high_max": 1.7605634639039636e-05, + "clip_ratio/high_mean": 5.297029474604642e-06, + "clip_ratio/low_mean": 5.688933060810086e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.218636053745286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15849.0, + "completions/mean_length": 7077.1640625, + "completions/mean_terminated_length": 6619.45068359375, + 
"completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.8749325424432755, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0028338562697172165, + "learning_rate": 1e-05, + "loss": 0.0643, + "num_tokens": 321783852.0, + "reward": 0.3828125, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998220205307007, + "sampling/importance_sampling_ratio/min": 7.83290306571871e-06, + "sampling/sampling_logp_difference/max": 11.757177352905273, + "sampling/sampling_logp_difference/mean": 0.020299233496189117, + "step": 368 + }, + { + "clip_ratio/high_max": 7.301828190975357e-06, + "clip_ratio/high_mean": 1.8254570477438392e-06, + "clip_ratio/low_mean": 5.158197632226802e-05, + "clip_ratio/low_min": 3.735804057214409e-06, + "clip_ratio/region_mean": 5.340743223314348e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6034.296875, + "completions/mean_terminated_length": 5525.294921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.80014718323946, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022897711023688316, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 322572882.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999347925186157, + "sampling/importance_sampling_ratio/min": 0.0004105660773348063, + "sampling/sampling_logp_difference/max": 7.7979736328125, + "sampling/sampling_logp_difference/mean": 0.01858348958194256, + "step": 369 + 
}, + { + "clip_ratio/high_max": 9.364057859784225e-06, + "clip_ratio/high_mean": 3.351393047523743e-06, + "clip_ratio/low_mean": 4.186752630630508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5218919240141986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 8172.109375, + "completions/mean_terminated_length": 7838.29248046875, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "entropy": 0.8732693120837212, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003263789461925626, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 323640904.0, + "reward": 0.2890625, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999354481697083, + "sampling/importance_sampling_ratio/min": 9.27252222027164e-06, + "sampling/sampling_logp_difference/max": 11.588455200195312, + "sampling/sampling_logp_difference/mean": 0.0208889190107584, + "step": 370 + }, + { + "clip_ratio/high_max": 2.0998899799451465e-05, + "clip_ratio/high_mean": 6.692962131182867e-06, + "clip_ratio/low_mean": 4.261424010110204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930720297124935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 7699.203125, + "completions/mean_terminated_length": 7419.04833984375, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.8296505436301231, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0042716520838439465, + "learning_rate": 1e-05, + "loss": 0.0937, + "num_tokens": 324643858.0, + "reward": 
0.4921875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874234199524, + "sampling/importance_sampling_ratio/min": 0.00022192654432728887, + "sampling/sampling_logp_difference/max": 8.413164138793945, + "sampling/sampling_logp_difference/mean": 0.018926654011011124, + "step": 371 + }, + { + "clip_ratio/high_max": 7.061349151626928e-06, + "clip_ratio/high_mean": 1.765337287906732e-06, + "clip_ratio/low_mean": 4.5005243464402156e-05, + "clip_ratio/low_min": 3.861838649754645e-06, + "clip_ratio/region_mean": 4.6770580411248375e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 7450.1640625, + "completions/mean_terminated_length": 7450.1640625, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 1.0400195196270943, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033558050636202097, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 325617687.0, + "reward": 0.2578125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999459385871887, + "sampling/importance_sampling_ratio/min": 0.039920732378959656, + "sampling/sampling_logp_difference/max": 3.2208595275878906, + "sampling/sampling_logp_difference/mean": 0.02249298244714737, + "step": 372 + }, + { + "clip_ratio/high_max": 1.3147802746971138e-05, + "clip_ratio/high_mean": 3.2869506867427845e-06, + "clip_ratio/low_mean": 2.4451034505545977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7737984851228248e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15342.0, + "completions/mean_length": 6799.0703125, + "completions/mean_terminated_length": 6723.5986328125, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9737623482942581, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005797459278255701, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 326508384.0, + "reward": 0.3125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321699142456, + "sampling/importance_sampling_ratio/min": 7.535634836131067e-07, + "sampling/sampling_logp_difference/max": 14.0984525680542, + "sampling/sampling_logp_difference/mean": 0.021543748676776886, + "step": 373 + }, + { + "clip_ratio/high_max": 3.3594023989280686e-06, + "clip_ratio/high_mean": 8.398505997320171e-07, + "clip_ratio/low_mean": 2.3457610382138228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4297460981870245e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 7034.3671875, + "completions/mean_terminated_length": 6654.30078125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8749603256583214, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002258980879560113, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 327426407.0, + "reward": 0.4609375, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + 
"sampling/importance_sampling_ratio/min": 0.008719252422451973, + "sampling/sampling_logp_difference/max": 4.742221832275391, + "sampling/sampling_logp_difference/mean": 0.01997346058487892, + "step": 374 + }, + { + "clip_ratio/high_max": 2.823375348270929e-05, + "clip_ratio/high_mean": 7.058438370677322e-06, + "clip_ratio/low_mean": 4.9395109726901865e-05, + "clip_ratio/low_min": 1.636556044104509e-05, + "clip_ratio/region_mean": 5.6453548268109444e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15240.0, + "completions/mean_length": 6623.078125, + "completions/mean_terminated_length": 6388.81640625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.858784057199955, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002420129720121622, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 328292985.0, + "reward": 0.4140625, + "reward_std": 0.3077537417411804, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 0.00014900295354891568, + "sampling/sampling_logp_difference/max": 8.811544418334961, + "sampling/sampling_logp_difference/mean": 0.019645996391773224, + "step": 375 + }, + { + "clip_ratio/high_max": 1.8078507309837732e-05, + "clip_ratio/high_mean": 6.468551191574079e-06, + "clip_ratio/low_mean": 4.051302585139638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.698157727034413e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15229.0, + "completions/mean_length": 5902.4765625, + "completions/mean_terminated_length": 5564.36279296875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + 
"entropy": 0.904740035533905, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004107976797968149, + "learning_rate": 1e-05, + "loss": 0.0824, + "num_tokens": 329067006.0, + "reward": 0.5546875, + "reward_std": 0.3945493996143341, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05, + "sampling/sampling_logp_difference/max": 11.37439250946045, + "sampling/sampling_logp_difference/mean": 0.019582755863666534, + "step": 376 + }, + { + "clip_ratio/high_max": 2.553658168835682e-05, + "clip_ratio/high_mean": 7.276365181496658e-06, + "clip_ratio/low_mean": 1.7552573126522475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.482893796695862e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6425.6015625, + "completions/mean_terminated_length": 6267.5322265625, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.964553713798523, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003208522219210863, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 329910691.0, + "reward": 0.359375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999419450759888, + "sampling/importance_sampling_ratio/min": 0.00137569778598845, + "sampling/sampling_logp_difference/max": 6.588794231414795, + "sampling/sampling_logp_difference/mean": 0.021154657006263733, + "step": 377 + }, + { + "clip_ratio/high_max": 6.8712420215888415e-06, + "clip_ratio/high_mean": 
1.7178105053972104e-06, + "clip_ratio/low_mean": 4.0991827404468495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2709637853022286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 8006.4453125, + "completions/mean_terminated_length": 7594.43408203125, + "completions/min_length": 1235.0, + "completions/min_terminated_length": 1235.0, + "entropy": 0.8980336412787437, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002898421371355653, + "learning_rate": 1e-05, + "loss": 0.0815, + "num_tokens": 330956332.0, + "reward": 0.4296875, + "reward_std": 0.20175684988498688, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 9.378339746035635e-05, + "sampling/sampling_logp_difference/max": 9.27452278137207, + "sampling/sampling_logp_difference/mean": 0.021021340042352676, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2689344689297286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2689344689297286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15484.0, + "completions/max_terminated_length": 15484.0, + "completions/mean_length": 7068.828125, + "completions/mean_terminated_length": 7068.828125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.9865007549524307, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0037063576746731997, + "learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 331880918.0, + "reward": 0.3203125, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 
0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0001819290773710236, + "sampling/sampling_logp_difference/max": 8.611893653869629, + "sampling/sampling_logp_difference/mean": 0.02072504535317421, + "step": 379 + }, + { + "clip_ratio/high_max": 5.845633268108941e-06, + "clip_ratio/high_mean": 1.4614083170272352e-06, + "clip_ratio/low_mean": 3.207486906831036e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353627721480734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 7379.390625, + "completions/mean_terminated_length": 7236.4609375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "entropy": 0.8977236375212669, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001972826896235347, + "learning_rate": 1e-05, + "loss": 0.0228, + "num_tokens": 332849112.0, + "reward": 0.4140625, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 2.820451663865242e-05, + "sampling/sampling_logp_difference/max": 10.476028442382812, + "sampling/sampling_logp_difference/mean": 0.019411223009228706, + "step": 380 + }, + { + "clip_ratio/high_max": 4.875385002378607e-06, + "clip_ratio/high_mean": 1.2188462505946518e-06, + "clip_ratio/low_mean": 2.3530714997832547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.47495612484272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15517.0, + "completions/mean_length": 6867.9609375, + 
"completions/mean_terminated_length": 6793.03125, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "entropy": 0.9244343340396881, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.006926023401319981, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333746179.0, + "reward": 0.4140625, + "reward_std": 0.1433562934398651, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.0003875594411510974, + "sampling/sampling_logp_difference/max": 7.8556413650512695, + "sampling/sampling_logp_difference/mean": 0.020311862230300903, + "step": 381 + }, + { + "clip_ratio/high_max": 1.5651628245905158e-05, + "clip_ratio/high_mean": 4.836261211949022e-06, + "clip_ratio/low_mean": 5.268017821435933e-05, + "clip_ratio/low_min": 3.950945028918795e-06, + "clip_ratio/region_mean": 5.751643902840442e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 7525.375, + "completions/mean_terminated_length": 6855.3955078125, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9207312315702438, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0047226278111338615, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 334731027.0, + "reward": 0.3359375, + "reward_std": 0.3353874683380127, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999615550041199, + "sampling/importance_sampling_ratio/min": 0.00029753465787507594, + "sampling/sampling_logp_difference/max": 8.119979858398438, + 
"sampling/sampling_logp_difference/mean": 0.021496692672371864, + "step": 382 + }, + { + "clip_ratio/high_max": 3.815379886873416e-05, + "clip_ratio/high_mean": 9.53844971718354e-06, + "clip_ratio/low_mean": 4.519663821156428e-05, + "clip_ratio/low_min": 2.775434040813707e-06, + "clip_ratio/region_mean": 5.473508826980833e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16251.0, + "completions/mean_length": 6841.0625, + "completions/mean_terminated_length": 6453.13818359375, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.8979457840323448, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004971448332071304, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 335631243.0, + "reward": 0.390625, + "reward_std": 0.2596156895160675, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999934196472168, + "sampling/importance_sampling_ratio/min": 9.655764188210014e-06, + "sampling/sampling_logp_difference/max": 11.547955513000488, + "sampling/sampling_logp_difference/mean": 0.020256079733371735, + "step": 383 + }, + { + "clip_ratio/high_max": 4.162365712545579e-06, + "clip_ratio/high_mean": 1.0405914281363948e-06, + "clip_ratio/low_mean": 3.1563491688757495e-05, + "clip_ratio/low_min": 3.1228139505401487e-06, + "clip_ratio/region_mean": 3.260408311689389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15060.0, + "completions/mean_length": 6919.8046875, + "completions/mean_terminated_length": 6454.35205078125, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9241961911320686, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0038604787550866604, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 336537162.0, + "reward": 0.375, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998080730438232, + "sampling/importance_sampling_ratio/min": 0.0009118975722230971, + "sampling/sampling_logp_difference/max": 6.999982833862305, + "sampling/sampling_logp_difference/mean": 0.02030865103006363, + "step": 384 + }, + { + "clip_ratio/high_max": 6.5182248363271356e-06, + "clip_ratio/high_mean": 1.6295562090817839e-06, + "clip_ratio/low_mean": 4.3847362121596234e-05, + "clip_ratio/low_min": 6.294533704931382e-06, + "clip_ratio/region_mean": 4.547691833067802e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15692.0, + "completions/mean_length": 7679.390625, + "completions/mean_terminated_length": 7099.08349609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 1.0165777206420898, + "epoch": 0.35418583256669733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004624314606189728, + "learning_rate": 1e-05, + "loss": 0.0849, + "num_tokens": 337542492.0, + "reward": 0.3046875, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999251961708069, + "sampling/importance_sampling_ratio/min": 5.83546279813163e-05, + "sampling/sampling_logp_difference/max": 9.748971939086914, + "sampling/sampling_logp_difference/mean": 0.02206476218998432, + "step": 385 + }, + { + "clip_ratio/high_max": 6.00499606662197e-06, + "clip_ratio/high_mean": 1.5012490166554926e-06, + "clip_ratio/low_mean": 3.392923713363416e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.543048615028965e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 5957.5859375, + "completions/mean_terminated_length": 5792.08740234375, + "completions/min_length": 1705.0, + "completions/min_terminated_length": 1705.0, + "entropy": 0.7705951780080795, + "epoch": 0.35510579576816925, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021966886706650257, + "learning_rate": 1e-05, + "loss": 0.0789, + "num_tokens": 338324279.0, + "reward": 0.53125, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999998927116394, + "sampling/importance_sampling_ratio/min": 0.0008041196851991117, + "sampling/sampling_logp_difference/max": 7.125762462615967, + "sampling/sampling_logp_difference/mean": 0.01804077997803688, + "step": 386 + }, + { + "clip_ratio/high_max": 1.5711350215497077e-05, + "clip_ratio/high_mean": 3.927837553874269e-06, + "clip_ratio/low_mean": 5.276240381135722e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.669024130838807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7269.8046875, + "completions/mean_terminated_length": 7198.03955078125, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 1.0025205165147781, + "epoch": 0.3560257589696412, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001694107661023736, + "learning_rate": 1e-05, + "loss": 0.134, + "num_tokens": 339274662.0, + "reward": 0.3359375, + "reward_std": 0.30487072467803955, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039769172668, + "sampling/importance_sampling_ratio/min": 0.0015677008777856827, + "sampling/sampling_logp_difference/max": 6.4581451416015625, + "sampling/sampling_logp_difference/mean": 0.021742526441812515, + "step": 387 + }, + { + "clip_ratio/high_max": 7.005848829066963e-06, + "clip_ratio/high_mean": 1.7514622072667407e-06, + "clip_ratio/low_mean": 5.100632029098051e-05, + "clip_ratio/low_min": 8.934973720897688e-06, + "clip_ratio/region_mean": 5.275778244140383e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7643.8359375, + "completions/mean_terminated_length": 7288.54443359375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 1061.0, + "entropy": 0.7936615869402885, + "epoch": 0.35694572217111314, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004587972536683083, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 340272689.0, + "reward": 0.5078125, + "reward_std": 0.35324612259864807, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999613761901855, + "sampling/importance_sampling_ratio/min": 0.0007390327518805861, + "sampling/sampling_logp_difference/max": 7.210168361663818, + "sampling/sampling_logp_difference/mean": 0.01862112432718277, + "step": 388 + }, + { + "clip_ratio/high_max": 1.0522736374696251e-05, + "clip_ratio/high_mean": 2.6306840936740628e-06, + "clip_ratio/low_mean": 2.139122614153166e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4021910121518886e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14401.0, + "completions/mean_length": 7068.734375, + "completions/mean_terminated_length": 6610.60595703125, + 
"completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "entropy": 0.8858344480395317, + "epoch": 0.3578656853725851, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00245783943682909, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 341195599.0, + "reward": 0.4609375, + "reward_std": 0.21594557166099548, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957263469696, + "sampling/importance_sampling_ratio/min": 1.526316918898374e-05, + "sampling/sampling_logp_difference/max": 11.090067863464355, + "sampling/sampling_logp_difference/mean": 0.019989900290966034, + "step": 389 + }, + { + "clip_ratio/high_max": 5.272259386401856e-06, + "clip_ratio/high_mean": 1.318064846600464e-06, + "clip_ratio/low_mean": 2.2939096254503966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4257160987417592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15788.0, + "completions/mean_length": 6093.296875, + "completions/mean_terminated_length": 5929.95263671875, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.9640207663178444, + "epoch": 0.35878564857405704, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0067657483741641045, + "learning_rate": 1e-05, + "loss": 0.0181, + "num_tokens": 341993565.0, + "reward": 0.4453125, + "reward_std": 0.12415502220392227, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998992681503296, + "sampling/importance_sampling_ratio/min": 0.010459281504154205, + "sampling/sampling_logp_difference/max": 4.56026554107666, + "sampling/sampling_logp_difference/mean": 0.02037961222231388, + "step": 390 + 
}, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.566248594528588e-05, + "clip_ratio/low_min": 4.402028480399167e-06, + "clip_ratio/region_mean": 4.566248594528588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16170.0, + "completions/max_terminated_length": 16170.0, + "completions/mean_length": 7620.09375, + "completions/mean_terminated_length": 7620.09375, + "completions/min_length": 1076.0, + "completions/min_terminated_length": 1076.0, + "entropy": 0.9773544892668724, + "epoch": 0.35970561177552896, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018817185191437602, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 342990545.0, + "reward": 0.3046875, + "reward_std": 0.18755048513412476, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0006883936002850533, + "sampling/sampling_logp_difference/max": 7.281149864196777, + "sampling/sampling_logp_difference/mean": 0.021528441458940506, + "step": 391 + }, + { + "clip_ratio/high_max": 2.6727505428425502e-05, + "clip_ratio/high_mean": 7.985045499481203e-06, + "clip_ratio/low_mean": 7.762144696243922e-05, + "clip_ratio/low_min": 2.4772080450929934e-05, + "clip_ratio/region_mean": 8.560649303035461e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15053.0, + "completions/mean_length": 6963.984375, + "completions/mean_terminated_length": 6737.904296875, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.9683744385838509, + "epoch": 0.36062557497700093, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052104732021689415, + "learning_rate": 1e-05, + "loss": 0.087, + "num_tokens": 343898791.0, + "reward": 0.4296875, + 
"reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324679374695, + "sampling/importance_sampling_ratio/min": 0.010815954767167568, + "sampling/sampling_logp_difference/max": 4.526732921600342, + "sampling/sampling_logp_difference/mean": 0.021434593945741653, + "step": 392 + }, + { + "clip_ratio/high_max": 1.3545108686230378e-05, + "clip_ratio/high_mean": 4.365133804640209e-06, + "clip_ratio/low_mean": 2.5377692509209737e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9742826200163108e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15116.0, + "completions/mean_length": 6718.5078125, + "completions/mean_terminated_length": 6642.4013671875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9043834507465363, + "epoch": 0.36154553817847285, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005151392426341772, + "learning_rate": 1e-05, + "loss": 0.0085, + "num_tokens": 344779672.0, + "reward": 0.4921875, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999840497970581, + "sampling/importance_sampling_ratio/min": 0.0024171893019229174, + "sampling/sampling_logp_difference/max": 6.025149822235107, + "sampling/sampling_logp_difference/mean": 0.0201373603194952, + "step": 393 + }, + { + "clip_ratio/high_max": 1.2263486723895767e-05, + "clip_ratio/high_mean": 3.927679188109323e-06, + "clip_ratio/low_mean": 2.739263118201052e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132031042696326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, 
+ "completions/max_terminated_length": 16342.0, + "completions/mean_length": 7044.640625, + "completions/mean_terminated_length": 6820.49609375, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "entropy": 0.9017335474491119, + "epoch": 0.3624655013799448, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026606651954352856, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 345701722.0, + "reward": 0.3125, + "reward_std": 0.24146249890327454, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05, + "sampling/sampling_logp_difference/max": 10.157968521118164, + "sampling/sampling_logp_difference/mean": 0.01981864869594574, + "step": 394 + }, + { + "clip_ratio/high_max": 1.026556356009678e-05, + "clip_ratio/high_mean": 2.566390890024195e-06, + "clip_ratio/low_mean": 4.819571529424138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0762106297952414e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15476.0, + "completions/mean_length": 6031.875, + "completions/mean_terminated_length": 5950.3623046875, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.8537683561444283, + "epoch": 0.36338546458141674, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003957017324864864, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 346492810.0, + "reward": 0.4296875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999707341194153, + "sampling/importance_sampling_ratio/min": 0.0015133036067709327, + 
"sampling/sampling_logp_difference/max": 6.493460178375244, + "sampling/sampling_logp_difference/mean": 0.018711457028985023, + "step": 395 + }, + { + "clip_ratio/high_max": 5.870488848813693e-06, + "clip_ratio/high_mean": 1.4676222122034233e-06, + "clip_ratio/low_mean": 3.637038832948747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.783801014378696e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 7429.3515625, + "completions/mean_terminated_length": 6911.31396484375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 1194.0, + "entropy": 0.8821266070008278, + "epoch": 0.36430542778288866, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002122648525983095, + "learning_rate": 1e-05, + "loss": 0.1257, + "num_tokens": 347462871.0, + "reward": 0.453125, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000076293945312, + "sampling/importance_sampling_ratio/min": 0.00014005196862854064, + "sampling/sampling_logp_difference/max": 8.873497009277344, + "sampling/sampling_logp_difference/mean": 0.01998838409781456, + "step": 396 + }, + { + "clip_ratio/high_max": 1.0663932243915042e-05, + "clip_ratio/high_mean": 2.6659830609787605e-06, + "clip_ratio/low_mean": 6.443337406381033e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.709935701110226e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15761.0, + "completions/mean_length": 7131.7109375, + "completions/mean_terminated_length": 6833.25, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.8575824722647667, + "epoch": 0.36522539098436063, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.002546454081311822, + "learning_rate": 1e-05, + "loss": 0.0676, + "num_tokens": 348395842.0, + "reward": 0.4921875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999964714050293, + "sampling/importance_sampling_ratio/min": 0.0002167800412280485, + "sampling/sampling_logp_difference/max": 8.436627388000488, + "sampling/sampling_logp_difference/mean": 0.0193922221660614, + "step": 397 + }, + { + "clip_ratio/high_max": 3.847337666229578e-06, + "clip_ratio/high_mean": 9.618344165573944e-07, + "clip_ratio/low_mean": 3.932982110654848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.029165563679271e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16200.0, + "completions/mean_length": 6858.34375, + "completions/mean_terminated_length": 6707.14306640625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.9539813920855522, + "epoch": 0.36614535418583255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00492837093770504, + "learning_rate": 1e-05, + "loss": 0.0818, + "num_tokens": 349292790.0, + "reward": 0.390625, + "reward_std": 0.1949220597743988, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998850226402283, + "sampling/importance_sampling_ratio/min": 0.0011153683299198747, + "sampling/sampling_logp_difference/max": 6.79857063293457, + "sampling/sampling_logp_difference/mean": 0.020318543538451195, + "step": 398 + }, + { + "clip_ratio/high_max": 1.291372609557584e-05, + "clip_ratio/high_mean": 3.22843152389396e-06, + "clip_ratio/low_mean": 3.8245348378040944e-05, + "clip_ratio/low_min": 0.0, 
+ "clip_ratio/region_mean": 4.1473780811429606e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15261.0, + "completions/mean_length": 7809.984375, + "completions/mean_terminated_length": 7533.40283203125, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.8353303670883179, + "epoch": 0.3670653173873045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004895905964076519, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 350312556.0, + "reward": 0.3203125, + "reward_std": 0.22567616403102875, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999260306358337, + "sampling/importance_sampling_ratio/min": 0.0008417933131568134, + "sampling/sampling_logp_difference/max": 7.0799760818481445, + "sampling/sampling_logp_difference/mean": 0.018754083663225174, + "step": 399 + }, + { + "clip_ratio/high_max": 1.1250081115576904e-05, + "clip_ratio/high_mean": 3.5690324011738994e-06, + "clip_ratio/low_mean": 3.196108968950284e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.553012152224255e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15057.0, + "completions/mean_length": 7194.9296875, + "completions/mean_terminated_length": 6821.39013671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9744522422552109, + "epoch": 0.36798528058877644, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0032397822942584753, + "learning_rate": 1e-05, + "loss": 0.0402, + "num_tokens": 351252755.0, + "reward": 0.421875, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998766183853149, + "sampling/importance_sampling_ratio/min": 0.00023159870761446655, + "sampling/sampling_logp_difference/max": 8.370504379272461, + "sampling/sampling_logp_difference/mean": 0.02105094864964485, + "step": 400 + }, + { + "clip_ratio/high_max": 6.980455509619787e-06, + "clip_ratio/high_mean": 1.7451138774049468e-06, + "clip_ratio/low_mean": 2.2670621888210007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.441573599298863e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 6836.234375, + "completions/mean_terminated_length": 6607.08837890625, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.9149863049387932, + "epoch": 0.3689052437902484, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031576494220644236, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 352145873.0, + "reward": 0.3671875, + "reward_std": 0.22225630283355713, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266862869263, + "sampling/importance_sampling_ratio/min": 0.0011975533561781049, + "sampling/sampling_logp_difference/max": 6.727474689483643, + "sampling/sampling_logp_difference/mean": 0.020445333793759346, + "step": 401 + }, + { + "clip_ratio/high_max": 2.3557336589874467e-05, + "clip_ratio/high_mean": 5.889334147468617e-06, + "clip_ratio/low_mean": 5.359988131203863e-05, + "clip_ratio/low_min": 1.3856095392839052e-05, + "clip_ratio/region_mean": 5.9489215118446737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 6942.65625, + 
"completions/mean_terminated_length": 6638.0966796875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.7541583999991417, + "epoch": 0.36982520699172033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003970830701291561, + "learning_rate": 1e-05, + "loss": 0.051, + "num_tokens": 353056405.0, + "reward": 0.453125, + "reward_std": 0.3282659649848938, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 8.399576472584158e-06, + "sampling/sampling_logp_difference/max": 11.687329292297363, + "sampling/sampling_logp_difference/mean": 0.018101349472999573, + "step": 402 + }, + { + "clip_ratio/high_max": 2.6139805413549766e-05, + "clip_ratio/high_mean": 7.517377525800839e-06, + "clip_ratio/low_mean": 1.968103515537223e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7198412681173068e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14786.0, + "completions/max_terminated_length": 14786.0, + "completions/mean_length": 6022.1875, + "completions/mean_terminated_length": 6022.1875, + "completions/min_length": 1285.0, + "completions/min_terminated_length": 1285.0, + "entropy": 0.9535745903849602, + "epoch": 0.37074517019319225, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0043656788766384125, + "learning_rate": 1e-05, + "loss": 0.029, + "num_tokens": 353844661.0, + "reward": 0.4140625, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.04981832951307297, + "sampling/sampling_logp_difference/max": 2.9993722438812256, + 
"sampling/sampling_logp_difference/mean": 0.020655371248722076, + "step": 403 + }, + { + "clip_ratio/high_max": 9.152076700047473e-06, + "clip_ratio/high_mean": 2.9508817647183605e-06, + "clip_ratio/low_mean": 5.21388310517068e-05, + "clip_ratio/low_min": 2.633131089169183e-06, + "clip_ratio/region_mean": 5.508971298695542e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15906.0, + "completions/mean_length": 8068.96875, + "completions/mean_terminated_length": 7869.408203125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.9473539590835571, + "epoch": 0.3716651333946642, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006543307099491358, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 354894689.0, + "reward": 0.2578125, + "reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 6.672408926533535e-05, + "sampling/sampling_logp_difference/max": 9.614944458007812, + "sampling/sampling_logp_difference/mean": 0.021852033212780952, + "step": 404 + }, + { + "clip_ratio/high_max": 2.9619268843816826e-05, + "clip_ratio/high_mean": 7.4048172109542065e-06, + "clip_ratio/low_mean": 5.5152235972855124e-05, + "clip_ratio/low_min": 1.0455875781190116e-05, + "clip_ratio/region_mean": 6.255705375224352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15748.0, + "completions/mean_length": 5960.1875, + "completions/mean_terminated_length": 5878.1103515625, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 0.9564141109585762, + "epoch": 0.37258509659613614, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.003351036459207535, + "learning_rate": 1e-05, + "loss": 0.0293, + "num_tokens": 355677273.0, + "reward": 0.46875, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999220371246338, + "sampling/importance_sampling_ratio/min": 0.0012859756825491786, + "sampling/sampling_logp_difference/max": 6.656237602233887, + "sampling/sampling_logp_difference/mean": 0.021779976785182953, + "step": 405 + }, + { + "clip_ratio/high_max": 7.957685966175632e-06, + "clip_ratio/high_mean": 1.989421491543908e-06, + "clip_ratio/low_mean": 3.758041248147492e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.956983414354909e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15669.0, + "completions/mean_length": 7620.21875, + "completions/mean_terminated_length": 7189.212890625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 1.035948596894741, + "epoch": 0.3735050597976081, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031219006050378084, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 356675829.0, + "reward": 0.296875, + "reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001060962677002, + "sampling/importance_sampling_ratio/min": 0.010141897015273571, + "sampling/sampling_logp_difference/max": 4.591080188751221, + "sampling/sampling_logp_difference/mean": 0.021951109170913696, + "step": 406 + }, + { + "clip_ratio/high_max": 2.286768199155631e-05, + "clip_ratio/high_mean": 5.7169204978890775e-06, + "clip_ratio/low_mean": 3.914574369900947e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.486266482217616e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14038.0, + "completions/mean_length": 5806.0234375, + "completions/mean_terminated_length": 5638.119140625, + "completions/min_length": 1319.0, + "completions/min_terminated_length": 1319.0, + "entropy": 0.8977029845118523, + "epoch": 0.37442502299908004, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002810312667861581, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 357438712.0, + "reward": 0.546875, + "reward_std": 0.22832970321178436, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999280571937561, + "sampling/importance_sampling_ratio/min": 0.0011738575994968414, + "sampling/sampling_logp_difference/max": 6.747459888458252, + "sampling/sampling_logp_difference/mean": 0.01965375244617462, + "step": 407 + }, + { + "clip_ratio/high_max": 1.2219379641464911e-05, + "clip_ratio/high_mean": 3.054844910366228e-06, + "clip_ratio/low_mean": 3.186109779562685e-05, + "clip_ratio/low_min": 4.3511558942554984e-06, + "clip_ratio/region_mean": 3.4915943160740426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15705.0, + "completions/max_terminated_length": 15705.0, + "completions/mean_length": 6537.4609375, + "completions/mean_terminated_length": 6537.4609375, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.9577726796269417, + "epoch": 0.37534498620055196, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004516562446951866, + "learning_rate": 1e-05, + "loss": 0.0517, + "num_tokens": 358296731.0, + "reward": 0.3828125, + "reward_std": 0.1830746978521347, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999170303344727, + "sampling/importance_sampling_ratio/min": 2.384942035860149e-06, + "sampling/sampling_logp_difference/max": 12.946335792541504, + "sampling/sampling_logp_difference/mean": 0.021242395043373108, + "step": 408 + }, + { + "clip_ratio/high_max": 1.4422689218918094e-05, + "clip_ratio/high_mean": 3.6056723047295236e-06, + "clip_ratio/low_mean": 3.026239573955536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3868068385345396e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 7896.671875, + "completions/mean_terminated_length": 7622.88671875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "entropy": 0.9163230583071709, + "epoch": 0.37626494940202393, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003542230697348714, + "learning_rate": 1e-05, + "loss": 0.05, + "num_tokens": 359327001.0, + "reward": 0.375, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998560547828674, + "sampling/importance_sampling_ratio/min": 0.00010891625424847007, + "sampling/sampling_logp_difference/max": 9.124931335449219, + "sampling/sampling_logp_difference/mean": 0.020085681229829788, + "step": 409 + }, + { + "clip_ratio/high_max": 1.7827243254942005e-05, + "clip_ratio/high_mean": 5.474494003010477e-06, + "clip_ratio/low_mean": 4.2465159026505717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.793965263161226e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15297.0, + "completions/mean_length": 6728.7109375, + "completions/mean_terminated_length": 
6652.68505859375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9010183215141296, + "epoch": 0.37718491260349585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0035069347359240055, + "learning_rate": 1e-05, + "loss": 0.0518, + "num_tokens": 360208780.0, + "reward": 0.5390625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999571442604065, + "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05, + "sampling/sampling_logp_difference/max": 11.124998092651367, + "sampling/sampling_logp_difference/mean": 0.021022530272603035, + "step": 410 + }, + { + "clip_ratio/high_max": 1.0376989393989788e-05, + "clip_ratio/high_mean": 2.594247348497447e-06, + "clip_ratio/low_mean": 2.8587513156708155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1181759936771414e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6800.3984375, + "completions/mean_terminated_length": 6491.25, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.8654960840940475, + "epoch": 0.3781048758049678, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033910400234162807, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 361098567.0, + "reward": 0.5625, + "reward_std": 0.2306838035583496, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998576641082764, + "sampling/importance_sampling_ratio/min": 0.001449413481168449, + "sampling/sampling_logp_difference/max": 6.536596298217773, + "sampling/sampling_logp_difference/mean": 0.019660964608192444, + 
"step": 411 + }, + { + "clip_ratio/high_max": 2.3068858354236e-05, + "clip_ratio/high_mean": 7.792090059410839e-06, + "clip_ratio/low_mean": 5.8515578757578623e-05, + "clip_ratio/low_min": 1.0348648629587842e-05, + "clip_ratio/region_mean": 6.630766870330262e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7103.4453125, + "completions/mean_terminated_length": 6956.13525390625, + "completions/min_length": 1711.0, + "completions/min_terminated_length": 1711.0, + "entropy": 0.8317076042294502, + "epoch": 0.37902483900643974, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036110079381614923, + "learning_rate": 1e-05, + "loss": 0.0834, + "num_tokens": 362027520.0, + "reward": 0.546875, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999338984489441, + "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05, + "sampling/sampling_logp_difference/max": 11.458046913146973, + "sampling/sampling_logp_difference/mean": 0.01939362846314907, + "step": 412 + }, + { + "clip_ratio/high_max": 3.112394779236638e-06, + "clip_ratio/high_mean": 7.780986948091595e-07, + "clip_ratio/low_mean": 5.127149995587388e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.204959859383962e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15830.0, + "completions/mean_length": 7344.9296875, + "completions/mean_terminated_length": 6900.384765625, + "completions/min_length": 1368.0, + "completions/min_terminated_length": 1368.0, + "entropy": 0.8387318029999733, + "epoch": 0.37994480220791166, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002141098491847515, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 
362985207.0, + "reward": 0.34375, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322891235352, + "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05, + "sampling/sampling_logp_difference/max": 10.874617576599121, + "sampling/sampling_logp_difference/mean": 0.01929464004933834, + "step": 413 + }, + { + "clip_ratio/high_max": 5.2602786126954015e-06, + "clip_ratio/high_mean": 1.3150696531738504e-06, + "clip_ratio/low_mean": 1.7854434247510653e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9169503786997666e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16137.0, + "completions/mean_length": 6377.7734375, + "completions/mean_terminated_length": 6218.94482421875, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9732858911156654, + "epoch": 0.38086476540938363, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015244127716869116, + "learning_rate": 1e-05, + "loss": 0.0608, + "num_tokens": 363823914.0, + "reward": 0.4375, + "reward_std": 0.1988610327243805, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 0.006335465237498283, + "sampling/sampling_logp_difference/max": 5.061592102050781, + "sampling/sampling_logp_difference/mean": 0.020688029006123543, + "step": 414 + }, + { + "clip_ratio/high_max": 2.6195500595349586e-05, + "clip_ratio/high_mean": 6.548875148837396e-06, + "clip_ratio/low_mean": 3.3802934012783226e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035180882056011e-05, + "completions/clipped_ratio": 0.0234375, 
+ "completions/max_length": 16384.0, + "completions/max_terminated_length": 14456.0, + "completions/mean_length": 5599.7890625, + "completions/mean_terminated_length": 5340.96826171875, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8872368410229683, + "epoch": 0.38178472861085555, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002647512126713991, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 364561127.0, + "reward": 0.453125, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999077916145325, + "sampling/importance_sampling_ratio/min": 2.370526999584399e-06, + "sampling/sampling_logp_difference/max": 12.952398300170898, + "sampling/sampling_logp_difference/mean": 0.01878243312239647, + "step": 415 + }, + { + "clip_ratio/high_max": 2.157278959202813e-05, + "clip_ratio/high_mean": 5.3931973980070325e-06, + "clip_ratio/low_mean": 7.215861739950924e-05, + "clip_ratio/low_min": 1.4898997051204788e-05, + "clip_ratio/region_mean": 7.755181559332414e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15905.0, + "completions/mean_length": 7877.2890625, + "completions/mean_terminated_length": 7385.1650390625, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.8416353687644005, + "epoch": 0.3827046918123275, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018051012884825468, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 365590124.0, + "reward": 0.3125, + "reward_std": 0.28407180309295654, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + 
"sampling/importance_sampling_ratio/min": 0.0004095165350008756, + "sampling/sampling_logp_difference/max": 7.800533294677734, + "sampling/sampling_logp_difference/mean": 0.019809434190392494, + "step": 416 + }, + { + "clip_ratio/high_max": 2.540994637456606e-05, + "clip_ratio/high_mean": 6.352486593641515e-06, + "clip_ratio/low_mean": 4.230594890941575e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8658435844117776e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16083.0, + "completions/mean_length": 6836.7890625, + "completions/mean_terminated_length": 6200.30859375, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.8647575601935387, + "epoch": 0.38362465501379944, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004550795070827007, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 366486337.0, + "reward": 0.40625, + "reward_std": 0.22620806097984314, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873638153076, + "sampling/importance_sampling_ratio/min": 0.0001089095021598041, + "sampling/sampling_logp_difference/max": 9.124993324279785, + "sampling/sampling_logp_difference/mean": 0.01992485672235489, + "step": 417 + }, + { + "clip_ratio/high_max": 1.1592664577619871e-05, + "clip_ratio/high_mean": 2.8981661444049678e-06, + "clip_ratio/low_mean": 3.5717548257707676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.861571451579948e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16286.0, + "completions/mean_length": 6884.953125, + "completions/mean_terminated_length": 6417.78662109375, + "completions/min_length": 1289.0, + "completions/min_terminated_length": 1289.0, + "entropy": 
0.8691708743572235, + "epoch": 0.3845446182152714, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005958946421742439, + "learning_rate": 1e-05, + "loss": 0.1054, + "num_tokens": 367386163.0, + "reward": 0.5078125, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 9.519772902422119e-06, + "sampling/sampling_logp_difference/max": 11.562139511108398, + "sampling/sampling_logp_difference/mean": 0.019436441361904144, + "step": 418 + }, + { + "clip_ratio/high_max": 2.7658640192385064e-05, + "clip_ratio/high_mean": 8.455849524580117e-06, + "clip_ratio/low_mean": 3.938097847822064e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7836828116487595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15574.0, + "completions/mean_length": 7439.1328125, + "completions/mean_terminated_length": 7150.58837890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.795464999973774, + "epoch": 0.38546458141674333, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00558120384812355, + "learning_rate": 1e-05, + "loss": 0.1918, + "num_tokens": 368357500.0, + "reward": 0.609375, + "reward_std": 0.3795146346092224, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.0001159337698481977, + "sampling/sampling_logp_difference/max": 9.062491416931152, + "sampling/sampling_logp_difference/mean": 0.018824251368641853, + "step": 419 + }, + { + "clip_ratio/high_max": 8.509555527780321e-06, + "clip_ratio/high_mean": 
2.1273888819450804e-06, + "clip_ratio/low_mean": 3.0958593640662e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.308598269313734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16236.0, + "completions/mean_length": 6751.53125, + "completions/mean_terminated_length": 6520.3525390625, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 0.9450879693031311, + "epoch": 0.38638454461821525, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004628168884664774, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 369242920.0, + "reward": 0.359375, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.0006074689445085824, + "sampling/sampling_logp_difference/max": 7.406209468841553, + "sampling/sampling_logp_difference/mean": 0.019376013427972794, + "step": 420 + }, + { + "clip_ratio/high_max": 1.8288420505996328e-05, + "clip_ratio/high_mean": 4.572105126499082e-06, + "clip_ratio/low_mean": 4.86290555272717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320115997164976e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16164.0, + "completions/mean_length": 7023.296875, + "completions/mean_terminated_length": 6315.3447265625, + "completions/min_length": 1628.0, + "completions/min_terminated_length": 1628.0, + "entropy": 0.7378111630678177, + "epoch": 0.3873045078196872, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00389425759203732, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 370159510.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + 
"rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999127388000488, + "sampling/importance_sampling_ratio/min": 0.00014012664905749261, + "sampling/sampling_logp_difference/max": 8.872963905334473, + "sampling/sampling_logp_difference/mean": 0.016914553940296173, + "step": 421 + }, + { + "clip_ratio/high_max": 2.1269573153404053e-05, + "clip_ratio/high_mean": 5.948400371380558e-06, + "clip_ratio/low_mean": 2.3538930747690756e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9487331687505502e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16018.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 7702.3046875, + "completions/mean_terminated_length": 7702.3046875, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "entropy": 0.9053447172045708, + "epoch": 0.38822447102115915, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004324545152485371, + "learning_rate": 1e-05, + "loss": 0.0149, + "num_tokens": 371162773.0, + "reward": 0.2421875, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00001060962677, + "sampling/importance_sampling_ratio/min": 2.283278627146501e-05, + "sampling/sampling_logp_difference/max": 10.687313079833984, + "sampling/sampling_logp_difference/mean": 0.020495830103754997, + "step": 422 + }, + { + "clip_ratio/high_max": 1.0294916819475475e-05, + "clip_ratio/high_mean": 2.5737292048688687e-06, + "clip_ratio/low_mean": 5.831611520079605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.088984559937671e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 6904.78125, 
+ "completions/mean_terminated_length": 6754.31787109375, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.7991176024079323, + "epoch": 0.3891444342226311, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003239463549107313, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 372067241.0, + "reward": 0.328125, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00012340991816017777, + "sampling/sampling_logp_difference/max": 8.999999046325684, + "sampling/sampling_logp_difference/mean": 0.019042208790779114, + "step": 423 + }, + { + "clip_ratio/high_max": 2.7261318791715894e-05, + "clip_ratio/high_mean": 7.926559305815317e-06, + "clip_ratio/low_mean": 1.552133551285806e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3447895273420727e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15399.0, + "completions/mean_length": 6107.7421875, + "completions/mean_terminated_length": 5602.35205078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.9495253190398216, + "epoch": 0.39006439742410304, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015464330790564418, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 372866072.0, + "reward": 0.421875, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971330165863, + "sampling/importance_sampling_ratio/min": 0.00024684349773451686, + "sampling/sampling_logp_difference/max": 8.306756019592285, + 
"sampling/sampling_logp_difference/mean": 0.019793221727013588, + "step": 424 + }, + { + "clip_ratio/high_max": 2.457227401464479e-05, + "clip_ratio/high_mean": 8.533324717063806e-06, + "clip_ratio/low_mean": 3.261690835643094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.115023284612107e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15939.0, + "completions/mean_length": 6079.8046875, + "completions/mean_terminated_length": 5747.4111328125, + "completions/min_length": 1082.0, + "completions/min_terminated_length": 1082.0, + "entropy": 0.8005363270640373, + "epoch": 0.39098436062557496, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024811832699924707, + "learning_rate": 1e-05, + "loss": 0.1124, + "num_tokens": 373663463.0, + "reward": 0.625, + "reward_std": 0.2630355656147003, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743103981018, + "sampling/importance_sampling_ratio/min": 0.00019348970090504736, + "sampling/sampling_logp_difference/max": 8.550286293029785, + "sampling/sampling_logp_difference/mean": 0.017151469364762306, + "step": 425 + }, + { + "clip_ratio/high_max": 3.3719989005476236e-06, + "clip_ratio/high_mean": 8.429997251369059e-07, + "clip_ratio/low_mean": 2.132218082806503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2165180553201935e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14925.0, + "completions/mean_length": 6453.7890625, + "completions/mean_terminated_length": 6375.5986328125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "entropy": 0.9212624430656433, + "epoch": 0.39190432382704693, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031475063879042864, + 
"learning_rate": 1e-05, + "loss": 0.0959, + "num_tokens": 374517492.0, + "reward": 0.34375, + "reward_std": 0.19910329580307007, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999594688415527, + "sampling/importance_sampling_ratio/min": 0.015664709731936455, + "sampling/sampling_logp_difference/max": 4.156344890594482, + "sampling/sampling_logp_difference/mean": 0.019899867475032806, + "step": 426 + }, + { + "clip_ratio/high_max": 1.907509408738406e-05, + "clip_ratio/high_mean": 5.984868664654641e-06, + "clip_ratio/low_mean": 3.784128080042137e-05, + "clip_ratio/low_min": 3.7751804029539926e-06, + "clip_ratio/region_mean": 4.382614952191943e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16159.0, + "completions/max_terminated_length": 16159.0, + "completions/mean_length": 6126.9921875, + "completions/mean_terminated_length": 6126.9921875, + "completions/min_length": 1106.0, + "completions/min_terminated_length": 1106.0, + "entropy": 0.8252849578857422, + "epoch": 0.39282428702851885, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004200868774205446, + "learning_rate": 1e-05, + "loss": 0.0276, + "num_tokens": 375320339.0, + "reward": 0.4140625, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999815225601196, + "sampling/importance_sampling_ratio/min": 0.005763276945799589, + "sampling/sampling_logp_difference/max": 5.156249046325684, + "sampling/sampling_logp_difference/mean": 0.01833093911409378, + "step": 427 + }, + { + "clip_ratio/high_max": 1.8918785372079583e-05, + "clip_ratio/high_mean": 5.476571459439583e-06, + "clip_ratio/low_mean": 6.169724406390742e-05, + "clip_ratio/low_min": 7.494657666029525e-06, + 
"clip_ratio/region_mean": 6.717381506859965e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15411.0, + "completions/mean_length": 6739.09375, + "completions/mean_terminated_length": 6427.9677734375, + "completions/min_length": 1228.0, + "completions/min_terminated_length": 1228.0, + "entropy": 0.8008574098348618, + "epoch": 0.3937442502299908, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003204014617949724, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 376201015.0, + "reward": 0.5390625, + "reward_std": 0.37086254358291626, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998303651809692, + "sampling/importance_sampling_ratio/min": 0.00010144581028725952, + "sampling/sampling_logp_difference/max": 9.195985794067383, + "sampling/sampling_logp_difference/mean": 0.018961725756525993, + "step": 428 + }, + { + "clip_ratio/high_max": 1.3558789078160771e-05, + "clip_ratio/high_mean": 3.389697269540193e-06, + "clip_ratio/low_mean": 5.3925050679026754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.731474743697618e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15634.0, + "completions/mean_length": 7245.8984375, + "completions/mean_terminated_length": 6951.12060546875, + "completions/min_length": 1306.0, + "completions/min_terminated_length": 1306.0, + "entropy": 1.0351596996188164, + "epoch": 0.39466421343146274, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0039763906970620155, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 377149650.0, + "reward": 0.375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000600814819336, + "sampling/importance_sampling_ratio/min": 8.106228051474318e-05, + "sampling/sampling_logp_difference/max": 9.420292854309082, + "sampling/sampling_logp_difference/mean": 0.020948028191924095, + "step": 429 + }, + { + "clip_ratio/high_max": 1.4580486549675697e-05, + "clip_ratio/high_mean": 4.259903903403028e-06, + "clip_ratio/low_mean": 4.6149686397711775e-05, + "clip_ratio/low_min": 3.006686938533676e-06, + "clip_ratio/region_mean": 5.04095905853319e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 6958.625, + "completions/mean_terminated_length": 6495.08154296875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "entropy": 0.8360240310430527, + "epoch": 0.39558417663293466, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0031417158897966146, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 378057802.0, + "reward": 0.515625, + "reward_std": 0.35771697759628296, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999384880065918, + "sampling/importance_sampling_ratio/min": 0.00010235882655251771, + "sampling/sampling_logp_difference/max": 9.187026023864746, + "sampling/sampling_logp_difference/mean": 0.019185224547982216, + "step": 430 + }, + { + "clip_ratio/high_max": 6.681633749394678e-06, + "clip_ratio/high_mean": 1.6704084373486694e-06, + "clip_ratio/low_mean": 5.096616632727091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.263657521936693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15410.0, + "completions/max_terminated_length": 15410.0, + "completions/mean_length": 5696.3984375, + "completions/mean_terminated_length": 5696.3984375, + 
"completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.7887749597430229, + "epoch": 0.39650413983440663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004943124484270811, + "learning_rate": 1e-05, + "loss": 0.096, + "num_tokens": 378808021.0, + "reward": 0.515625, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999057054519653, + "sampling/importance_sampling_ratio/min": 0.0015042300801724195, + "sampling/sampling_logp_difference/max": 6.499474048614502, + "sampling/sampling_logp_difference/mean": 0.018845941871404648, + "step": 431 + }, + { + "clip_ratio/high_max": 1.7526824194646906e-05, + "clip_ratio/high_mean": 5.417880970526312e-06, + "clip_ratio/low_mean": 3.513921649300755e-05, + "clip_ratio/low_min": 6.075038982089609e-06, + "clip_ratio/region_mean": 4.0557096895099676e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14233.0, + "completions/mean_length": 6480.8828125, + "completions/mean_terminated_length": 6323.69091796875, + "completions/min_length": 1013.0, + "completions/min_terminated_length": 1013.0, + "entropy": 0.8796411231160164, + "epoch": 0.39742410303587855, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00595651101320982, + "learning_rate": 1e-05, + "loss": 0.0546, + "num_tokens": 379659710.0, + "reward": 0.3984375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 0.0017907419241964817, + "sampling/sampling_logp_difference/max": 6.325125217437744, + "sampling/sampling_logp_difference/mean": 
0.01906527951359749, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4512424602107785e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4512424602107785e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 7501.703125, + "completions/mean_terminated_length": 6829.93310546875, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "entropy": 0.786028303205967, + "epoch": 0.3983440662373505, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0024527597706764936, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 380640720.0, + "reward": 0.5234375, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999595880508423, + "sampling/importance_sampling_ratio/min": 8.851602615322918e-07, + "sampling/sampling_logp_difference/max": 13.93749713897705, + "sampling/sampling_logp_difference/mean": 0.01873261108994484, + "step": 433 + }, + { + "clip_ratio/high_max": 1.4606259583160863e-05, + "clip_ratio/high_mean": 5.505394312876888e-06, + "clip_ratio/low_mean": 3.1679782978244475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7185177234277944e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15185.0, + "completions/mean_length": 5619.2890625, + "completions/mean_terminated_length": 5448.4208984375, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.8098893761634827, + "epoch": 0.39926402943882244, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004280989523977041, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 381377981.0, + "reward": 
0.609375, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.0010248658945783973, + "sampling/sampling_logp_difference/max": 6.883193492889404, + "sampling/sampling_logp_difference/mean": 0.017923470586538315, + "step": 434 + }, + { + "clip_ratio/high_max": 1.4808703554081148e-05, + "clip_ratio/high_mean": 3.702175888520287e-06, + "clip_ratio/low_mean": 2.3637440563106793e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7339616224253405e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5243.8203125, + "completions/mean_terminated_length": 5156.1025390625, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "entropy": 0.7485036551952362, + "epoch": 0.40018399264029436, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004721642471849918, + "learning_rate": 1e-05, + "loss": 0.0877, + "num_tokens": 382070478.0, + "reward": 0.6875, + "reward_std": 0.26538965106010437, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999414086341858, + "sampling/importance_sampling_ratio/min": 0.0011518355458974838, + "sampling/sampling_logp_difference/max": 6.7663984298706055, + "sampling/sampling_logp_difference/mean": 0.016579966992139816, + "step": 435 + }, + { + "clip_ratio/high_max": 3.1177480195765384e-05, + "clip_ratio/high_mean": 1.1174359769938746e-05, + "clip_ratio/low_mean": 3.602651599976525e-05, + "clip_ratio/low_min": 4.348733455117326e-06, + "clip_ratio/region_mean": 4.720087713394605e-05, + "completions/clipped_ratio": 0.015625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15978.0, + "completions/mean_length": 7021.1796875, + "completions/mean_terminated_length": 6872.56396484375, + "completions/min_length": 1371.0, + "completions/min_terminated_length": 1371.0, + "entropy": 0.8693460151553154, + "epoch": 0.40110395584176634, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00329192029312253, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 382990245.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.0023386883549392223, + "sampling/sampling_logp_difference/max": 6.058165073394775, + "sampling/sampling_logp_difference/mean": 0.019863136112689972, + "step": 436 + }, + { + "clip_ratio/high_max": 1.1192694955752813e-05, + "clip_ratio/high_mean": 2.7981737389382033e-06, + "clip_ratio/low_mean": 4.9078003257818636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.1876177280973934e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15344.0, + "completions/mean_length": 6917.625, + "completions/mean_terminated_length": 6452.0654296875, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "entropy": 0.8466897681355476, + "epoch": 0.40202391904323825, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0051889242604374886, + "learning_rate": 1e-05, + "loss": 0.1009, + "num_tokens": 383896717.0, + "reward": 0.4140625, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999983310699463, + 
"sampling/importance_sampling_ratio/min": 0.00015846389578655362, + "sampling/sampling_logp_difference/max": 8.749983787536621, + "sampling/sampling_logp_difference/mean": 0.019528398290276527, + "step": 437 + }, + { + "clip_ratio/high_max": 2.3224948108691024e-05, + "clip_ratio/high_mean": 8.263948757303297e-06, + "clip_ratio/low_mean": 3.8556312347282073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.682026019509067e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16175.0, + "completions/mean_length": 7487.5078125, + "completions/mean_terminated_length": 7346.2939453125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.9584660083055496, + "epoch": 0.4029438822447102, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002855573548004031, + "learning_rate": 1e-05, + "loss": 0.0087, + "num_tokens": 384872622.0, + "reward": 0.3828125, + "reward_std": 0.2477683424949646, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999386668205261, + "sampling/importance_sampling_ratio/min": 0.0038593418430536985, + "sampling/sampling_logp_difference/max": 5.557258605957031, + "sampling/sampling_logp_difference/mean": 0.0209865253418684, + "step": 438 + }, + { + "clip_ratio/high_max": 6.171620498207631e-06, + "clip_ratio/high_mean": 1.5429051245519076e-06, + "clip_ratio/low_mean": 2.98128834401723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.135578845103737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16092.0, + "completions/mean_length": 6637.5078125, + "completions/mean_terminated_length": 6323.1044921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 
0.8841215297579765, + "epoch": 0.40386384544618215, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004437311552464962, + "learning_rate": 1e-05, + "loss": 0.0523, + "num_tokens": 385744023.0, + "reward": 0.3984375, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999136924743652, + "sampling/importance_sampling_ratio/min": 0.002925124252215028, + "sampling/sampling_logp_difference/max": 5.834418296813965, + "sampling/sampling_logp_difference/mean": 0.019490888342261314, + "step": 439 + }, + { + "clip_ratio/high_max": 1.3304874300956726e-05, + "clip_ratio/high_mean": 3.3262185752391815e-06, + "clip_ratio/low_mean": 5.443932013804442e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.776553894065728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15143.0, + "completions/mean_length": 5965.9765625, + "completions/mean_terminated_length": 5800.611328125, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "entropy": 0.8726934269070625, + "epoch": 0.4047838086476541, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002463799435645342, + "learning_rate": 1e-05, + "loss": -0.0075, + "num_tokens": 386525492.0, + "reward": 0.3984375, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.00020367901015561074, + "sampling/sampling_logp_difference/max": 8.4989652633667, + "sampling/sampling_logp_difference/mean": 0.01946769654750824, + "step": 440 + }, + { + "clip_ratio/high_max": 1.0084711902891286e-05, + "clip_ratio/high_mean": 
3.6154040117253317e-06, + "clip_ratio/low_mean": 3.598771945689805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9603123695997056e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6693.109375, + "completions/mean_terminated_length": 6616.80322265625, + "completions/min_length": 1704.0, + "completions/min_terminated_length": 1704.0, + "entropy": 0.9430640190839767, + "epoch": 0.40570377184912604, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038990566972643137, + "learning_rate": 1e-05, + "loss": 0.0415, + "num_tokens": 387404842.0, + "reward": 0.421875, + "reward_std": 0.31587693095207214, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999700784683228, + "sampling/importance_sampling_ratio/min": 0.0011708902893587947, + "sampling/sampling_logp_difference/max": 6.749990940093994, + "sampling/sampling_logp_difference/mean": 0.020848294720053673, + "step": 441 + }, + { + "clip_ratio/high_max": 7.462686426151777e-06, + "clip_ratio/high_mean": 1.8656716065379442e-06, + "clip_ratio/low_mean": 5.234285907818048e-05, + "clip_ratio/low_min": 4.47803950009984e-06, + "clip_ratio/region_mean": 5.420853057103159e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + "completions/mean_length": 7045.6953125, + "completions/mean_terminated_length": 6505.46240234375, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "entropy": 0.8912066072225571, + "epoch": 0.40662373505059796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018510994268581271, + "learning_rate": 1e-05, + "loss": 0.099, + "num_tokens": 388324475.0, + "reward": 0.40625, + "reward_std": 0.32195523381233215, + 
"rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999024868011475, + "sampling/importance_sampling_ratio/min": 0.0031757301185280085, + "sampling/sampling_logp_difference/max": 5.752217769622803, + "sampling/sampling_logp_difference/mean": 0.020547039806842804, + "step": 442 + }, + { + "clip_ratio/high_max": 2.504527083146968e-05, + "clip_ratio/high_mean": 6.26131770786742e-06, + "clip_ratio/low_mean": 6.165269871871715e-05, + "clip_ratio/low_min": 3.5272871627967106e-06, + "clip_ratio/region_mean": 6.791401551708987e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15734.0, + "completions/mean_length": 7480.0078125, + "completions/mean_terminated_length": 7266.3125, + "completions/min_length": 1130.0, + "completions/min_terminated_length": 1130.0, + "entropy": 0.8813760280609131, + "epoch": 0.40754369825206993, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004439481534063816, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 389305644.0, + "reward": 0.34375, + "reward_std": 0.31300368905067444, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999762773513794, + "sampling/importance_sampling_ratio/min": 0.007449973840266466, + "sampling/sampling_logp_difference/max": 4.899544715881348, + "sampling/sampling_logp_difference/mean": 0.01973455585539341, + "step": 443 + }, + { + "clip_ratio/high_max": 4.0980917219712865e-06, + "clip_ratio/high_mean": 1.0245229304928216e-06, + "clip_ratio/low_mean": 3.662567087303614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.76501939172158e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15302.0, + 
"completions/max_terminated_length": 15302.0, + "completions/mean_length": 7044.4453125, + "completions/mean_terminated_length": 7044.4453125, + "completions/min_length": 1229.0, + "completions/min_terminated_length": 1229.0, + "entropy": 0.9901906549930573, + "epoch": 0.40846366145354185, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004181519150733948, + "learning_rate": 1e-05, + "loss": -0.0068, + "num_tokens": 390229373.0, + "reward": 0.421875, + "reward_std": 0.17700131237506866, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000314712524414, + "sampling/importance_sampling_ratio/min": 0.00022536676260642707, + "sampling/sampling_logp_difference/max": 8.397781372070312, + "sampling/sampling_logp_difference/mean": 0.021211043000221252, + "step": 444 + }, + { + "clip_ratio/high_max": 1.4909872106727562e-05, + "clip_ratio/high_mean": 3.7274680266818905e-06, + "clip_ratio/low_mean": 5.29995777469594e-05, + "clip_ratio/low_min": 3.708758640641463e-06, + "clip_ratio/region_mean": 5.672704537573736e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7815.8125, + "completions/mean_terminated_length": 7244.6005859375, + "completions/min_length": 1350.0, + "completions/min_terminated_length": 1350.0, + "entropy": 0.8278292864561081, + "epoch": 0.4093836246550138, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002691390924155712, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 391251141.0, + "reward": 0.3515625, + "reward_std": 0.31222954392433167, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 
0.007715471088886261, + "sampling/sampling_logp_difference/max": 4.864527702331543, + "sampling/sampling_logp_difference/mean": 0.018415704369544983, + "step": 445 + }, + { + "clip_ratio/high_max": 2.1858722902834415e-05, + "clip_ratio/high_mean": 6.629899417021079e-06, + "clip_ratio/low_mean": 3.196247394043894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.859237290271267e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15202.0, + "completions/mean_length": 5305.1796875, + "completions/mean_terminated_length": 5217.94482421875, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.8100772425532341, + "epoch": 0.41030358785648574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0069543467834591866, + "learning_rate": 1e-05, + "loss": 0.1153, + "num_tokens": 391956196.0, + "reward": 0.609375, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000190734863281, + "sampling/importance_sampling_ratio/min": 0.0024869756307452917, + "sampling/sampling_logp_difference/max": 5.996687889099121, + "sampling/sampling_logp_difference/mean": 0.017318082973361015, + "step": 446 + }, + { + "clip_ratio/high_max": 2.461934036546154e-05, + "clip_ratio/high_mean": 8.056288947955181e-06, + "clip_ratio/low_mean": 5.289376917971822e-05, + "clip_ratio/low_min": 4.21926688431995e-06, + "clip_ratio/region_mean": 6.0950058468733914e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15300.0, + "completions/mean_length": 7299.578125, + "completions/mean_terminated_length": 6930.29248046875, + "completions/min_length": 1008.0, + "completions/min_terminated_length": 1008.0, + "entropy": 0.9955824315547943, + 
"epoch": 0.41122355105795766, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0065611582249403, + "learning_rate": 1e-05, + "loss": 0.0883, + "num_tokens": 392908430.0, + "reward": 0.4375, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999696016311646, + "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06, + "sampling/sampling_logp_difference/max": 11.873339653015137, + "sampling/sampling_logp_difference/mean": 0.02127375639975071, + "step": 447 + }, + { + "clip_ratio/high_max": 2.4339562514796853e-05, + "clip_ratio/high_mean": 7.412756531266496e-06, + "clip_ratio/low_mean": 3.89272447591793e-05, + "clip_ratio/low_min": 4.047796210215893e-06, + "clip_ratio/region_mean": 4.6340001517819474e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16221.0, + "completions/mean_length": 6702.9375, + "completions/mean_terminated_length": 6390.64501953125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.82919991761446, + "epoch": 0.41214351425942963, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032975098583847284, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 393788286.0, + "reward": 0.4609375, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.00028582560480572283, + "sampling/sampling_logp_difference/max": 8.160128593444824, + "sampling/sampling_logp_difference/mean": 0.019461583346128464, + "step": 448 + }, + { + "clip_ratio/high_max": 2.3807599063729867e-05, + "clip_ratio/high_mean": 
5.951899765932467e-06, + "clip_ratio/low_mean": 3.195798365140945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.790988330365508e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15244.0, + "completions/mean_length": 6468.9453125, + "completions/mean_terminated_length": 5536.7607421875, + "completions/min_length": 808.0, + "completions/min_terminated_length": 808.0, + "entropy": 0.6471721827983856, + "epoch": 0.41306347746090155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032787907402962446, + "learning_rate": 1e-05, + "loss": 0.1149, + "num_tokens": 394638159.0, + "reward": 0.625, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.00012341380352154374, + "sampling/sampling_logp_difference/max": 8.999967575073242, + "sampling/sampling_logp_difference/mean": 0.016151495277881622, + "step": 449 + }, + { + "clip_ratio/high_max": 2.247072688987828e-05, + "clip_ratio/high_mean": 5.61768172246957e-06, + "clip_ratio/low_mean": 6.035319393049576e-05, + "clip_ratio/low_min": 4.063190772285452e-06, + "clip_ratio/region_mean": 6.597087667614687e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15931.0, + "completions/mean_length": 6547.3203125, + "completions/mean_terminated_length": 6230.0078125, + "completions/min_length": 587.0, + "completions/min_terminated_length": 587.0, + "entropy": 0.9123960956931114, + "epoch": 0.4139834406623735, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038375966250896454, + "learning_rate": 1e-05, + "loss": 0.0967, + "num_tokens": 395493872.0, + "reward": 0.4296875, + "reward_std": 0.30798619985580444, + "rewards/accuracy_reward/mean": 
0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.00016009423416107893, + "sampling/sampling_logp_difference/max": 8.739748001098633, + "sampling/sampling_logp_difference/mean": 0.019957344979047775, + "step": 450 + }, + { + "clip_ratio/high_max": 1.404482372890925e-05, + "clip_ratio/high_mean": 3.5112059322273126e-06, + "clip_ratio/low_mean": 2.315102483407827e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6662230766305584e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15058.0, + "completions/mean_length": 6291.859375, + "completions/mean_terminated_length": 6131.6669921875, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 0.9841655194759369, + "epoch": 0.41490340386384544, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003903903067111969, + "learning_rate": 1e-05, + "loss": 0.0656, + "num_tokens": 396320254.0, + "reward": 0.4296875, + "reward_std": 0.2569621503353119, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 6.564632712979801e-06, + "sampling/sampling_logp_difference/max": 11.93381404876709, + "sampling/sampling_logp_difference/mean": 0.020753150805830956, + "step": 451 + }, + { + "clip_ratio/high_max": 1.5189204987109406e-05, + "clip_ratio/high_mean": 4.615214265868417e-06, + "clip_ratio/low_mean": 3.547988831087423e-05, + "clip_ratio/low_min": 3.3967392027989263e-06, + "clip_ratio/region_mean": 4.009510257674265e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15966.0, 
+ "completions/mean_length": 7692.4296875, + "completions/mean_terminated_length": 7339.11376953125, + "completions/min_length": 1269.0, + "completions/min_terminated_length": 1269.0, + "entropy": 0.94080401211977, + "epoch": 0.41582336706531736, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005152889993041754, + "learning_rate": 1e-05, + "loss": 0.0511, + "num_tokens": 397327029.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 5.027571751270443e-05, + "sampling/sampling_logp_difference/max": 9.897988319396973, + "sampling/sampling_logp_difference/mean": 0.02036213129758835, + "step": 452 + }, + { + "clip_ratio/high_max": 1.733157705530175e-05, + "clip_ratio/high_mean": 6.0586507970583625e-06, + "clip_ratio/low_mean": 2.335082047011383e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9409470812424843e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15305.0, + "completions/mean_length": 6968.0859375, + "completions/mean_terminated_length": 6742.1044921875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9254838973283768, + "epoch": 0.41674333026678934, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035838852636516094, + "learning_rate": 1e-05, + "loss": 0.0182, + "num_tokens": 398237536.0, + "reward": 0.484375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.002404628787189722, + "sampling/sampling_logp_difference/max": 
6.030359745025635, + "sampling/sampling_logp_difference/mean": 0.020200733095407486, + "step": 453 + }, + { + "clip_ratio/high_max": 4.464923677005572e-06, + "clip_ratio/high_mean": 1.116230919251393e-06, + "clip_ratio/low_mean": 3.311113533754906e-05, + "clip_ratio/low_min": 6.725854291289579e-06, + "clip_ratio/region_mean": 3.422736637048729e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16309.0, + "completions/mean_length": 8711.078125, + "completions/mean_terminated_length": 8199.55078125, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "entropy": 0.8735406622290611, + "epoch": 0.41766329346826125, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0036290446296334267, + "learning_rate": 1e-05, + "loss": 0.0412, + "num_tokens": 399373298.0, + "reward": 0.359375, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000042200088501, + "sampling/importance_sampling_ratio/min": 9.216561011271551e-05, + "sampling/sampling_logp_difference/max": 9.291923522949219, + "sampling/sampling_logp_difference/mean": 0.0201371181756258, + "step": 454 + }, + { + "clip_ratio/high_max": 3.4702664606811595e-05, + "clip_ratio/high_mean": 8.675666151702899e-06, + "clip_ratio/low_mean": 3.3217100849469716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.189276808119757e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14737.0, + "completions/mean_length": 6891.078125, + "completions/mean_terminated_length": 6663.24853515625, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "entropy": 0.8689641878008842, + "epoch": 0.41858325666973323, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.004067540634423494, + "learning_rate": 1e-05, + "loss": 0.0633, + "num_tokens": 400273708.0, + "reward": 0.484375, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999425411224365, + "sampling/importance_sampling_ratio/min": 4.0002717582865444e-07, + "sampling/sampling_logp_difference/max": 14.731733322143555, + "sampling/sampling_logp_difference/mean": 0.019800148904323578, + "step": 455 + }, + { + "clip_ratio/high_max": 2.939170826721238e-06, + "clip_ratio/high_mean": 7.347927066803095e-07, + "clip_ratio/low_mean": 3.564125790944672e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6376050502440194e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15234.0, + "completions/mean_length": 6899.3515625, + "completions/mean_terminated_length": 6748.8017578125, + "completions/min_length": 1149.0, + "completions/min_terminated_length": 1149.0, + "entropy": 0.9442604705691338, + "epoch": 0.41950321987120515, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026191689539700747, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 401177497.0, + "reward": 0.46875, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 0.0017910725437104702, + "sampling/sampling_logp_difference/max": 6.3249406814575195, + "sampling/sampling_logp_difference/mean": 0.021380646154284477, + "step": 456 + }, + { + "clip_ratio/high_max": 8.99604128790088e-06, + "clip_ratio/high_mean": 2.24901032197522e-06, + "clip_ratio/low_mean": 2.57235833487357e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.797259367071092e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16226.0, + "completions/mean_length": 7175.8359375, + "completions/mean_terminated_length": 7029.6748046875, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.8653769046068192, + "epoch": 0.4204231830726771, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003141516586765647, + "learning_rate": 1e-05, + "loss": 0.0674, + "num_tokens": 402115812.0, + "reward": 0.4375, + "reward_std": 0.21040895581245422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999862909317017, + "sampling/importance_sampling_ratio/min": 0.001265019178390503, + "sampling/sampling_logp_difference/max": 6.672667980194092, + "sampling/sampling_logp_difference/mean": 0.01970163732767105, + "step": 457 + }, + { + "clip_ratio/high_max": 1.0800059499160852e-05, + "clip_ratio/high_mean": 2.700014874790213e-06, + "clip_ratio/low_mean": 3.116219727417047e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3862211807900167e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 7090.8515625, + "completions/mean_terminated_length": 6791.072265625, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.9437825232744217, + "epoch": 0.42134314627414904, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001980370609089732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 403048385.0, + "reward": 0.4609375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 1.4011449138706666e-06, + "sampling/sampling_logp_difference/max": 13.47822093963623, + "sampling/sampling_logp_difference/mean": 0.021090596914291382, + "step": 458 + }, + { + "clip_ratio/high_max": 2.5482850560365478e-05, + "clip_ratio/high_mean": 6.370712640091369e-06, + "clip_ratio/low_mean": 4.8558076969129615e-05, + "clip_ratio/low_min": 4.8952420002024155e-06, + "clip_ratio/region_mean": 5.4928788131292094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16175.0, + "completions/mean_length": 7033.65625, + "completions/mean_terminated_length": 6809.24853515625, + "completions/min_length": 1007.0, + "completions/min_terminated_length": 1007.0, + "entropy": 0.8789731040596962, + "epoch": 0.42226310947562096, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003833206370472908, + "learning_rate": 1e-05, + "loss": 0.059, + "num_tokens": 403968037.0, + "reward": 0.46875, + "reward_std": 0.28460076451301575, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000317096710205, + "sampling/importance_sampling_ratio/min": 0.0021942879538983107, + "sampling/sampling_logp_difference/max": 6.1218976974487305, + "sampling/sampling_logp_difference/mean": 0.019913772121071815, + "step": 459 + }, + { + "clip_ratio/high_max": 4.068877842655638e-06, + "clip_ratio/high_mean": 1.0172194606639096e-06, + "clip_ratio/low_mean": 6.774969961043098e-05, + "clip_ratio/low_min": 3.189914878021227e-06, + "clip_ratio/region_mean": 6.876691895740805e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 6992.8984375, + "completions/mean_terminated_length": 
6611.14599609375, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "entropy": 0.857115626335144, + "epoch": 0.42318307267709293, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005315023008733988, + "learning_rate": 1e-05, + "loss": 0.1581, + "num_tokens": 404881584.0, + "reward": 0.3515625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000758171081543, + "sampling/importance_sampling_ratio/min": 4.546630952972919e-05, + "sampling/sampling_logp_difference/max": 9.998538970947266, + "sampling/sampling_logp_difference/mean": 0.01872519962489605, + "step": 460 + }, + { + "clip_ratio/high_max": 1.167047457784065e-05, + "clip_ratio/high_mean": 2.9176186444601626e-06, + "clip_ratio/low_mean": 3.3195502112448594e-05, + "clip_ratio/low_min": 5.25188033861923e-06, + "clip_ratio/region_mean": 3.611312064322192e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 6623.2578125, + "completions/mean_terminated_length": 6226.4794921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.8803941905498505, + "epoch": 0.42410303587856485, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0074885934591293335, + "learning_rate": 1e-05, + "loss": 0.1076, + "num_tokens": 405749105.0, + "reward": 0.515625, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.0011723897187039256, + "sampling/sampling_logp_difference/max": 6.748711109161377, + "sampling/sampling_logp_difference/mean": 
0.01930626854300499, + "step": 461 + }, + { + "clip_ratio/high_max": 4.11753080697963e-06, + "clip_ratio/high_mean": 1.0293827017449075e-06, + "clip_ratio/low_mean": 5.09268712676203e-05, + "clip_ratio/low_min": 1.1170248626513057e-05, + "clip_ratio/region_mean": 5.195625465148623e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15032.0, + "completions/mean_length": 7244.8203125, + "completions/mean_terminated_length": 6647.5419921875, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.9202689751982689, + "epoch": 0.4250229990800368, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003960717935115099, + "learning_rate": 1e-05, + "loss": 0.0536, + "num_tokens": 406704618.0, + "reward": 0.484375, + "reward_std": 0.2880108058452606, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 1.69715603988152e-05, + "sampling/sampling_logp_difference/max": 10.98397159576416, + "sampling/sampling_logp_difference/mean": 0.02019711770117283, + "step": 462 + }, + { + "clip_ratio/high_max": 2.874629831239872e-05, + "clip_ratio/high_mean": 1.0519701334033016e-05, + "clip_ratio/low_mean": 5.367962035052187e-05, + "clip_ratio/low_min": 6.5083827394119e-06, + "clip_ratio/region_mean": 6.419932219614566e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 7462.0546875, + "completions/mean_terminated_length": 6867.2587890625, + "completions/min_length": 669.0, + "completions/min_terminated_length": 669.0, + "entropy": 0.8141553401947021, + "epoch": 0.42594296228150874, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003602087963372469, + "learning_rate": 1e-05, + 
"loss": 0.1054, + "num_tokens": 407677177.0, + "reward": 0.421875, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999440312385559, + "sampling/importance_sampling_ratio/min": 0.0007806668290868402, + "sampling/sampling_logp_difference/max": 7.155362129211426, + "sampling/sampling_logp_difference/mean": 0.01856713369488716, + "step": 463 + }, + { + "clip_ratio/high_max": 2.6413443720230134e-05, + "clip_ratio/high_mean": 8.973188073468918e-06, + "clip_ratio/low_mean": 3.5997712757307454e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.497090230870526e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15750.0, + "completions/mean_length": 6683.1796875, + "completions/mean_terminated_length": 6529.19873046875, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "entropy": 0.9070071652531624, + "epoch": 0.42686292548298066, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004038481041789055, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 408552512.0, + "reward": 0.4609375, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 4.474630986806005e-05, + "sampling/sampling_logp_difference/max": 10.014501571655273, + "sampling/sampling_logp_difference/mean": 0.02077356167137623, + "step": 464 + }, + { + "clip_ratio/high_max": 1.7171289982798044e-05, + "clip_ratio/high_mean": 4.292822495699511e-06, + "clip_ratio/low_mean": 3.225401701456576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654683996501262e-05, + 
"completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15864.0, + "completions/mean_length": 6472.9453125, + "completions/mean_terminated_length": 5985.51611328125, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8807859197258949, + "epoch": 0.42778288868445263, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004457853268831968, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 409399257.0, + "reward": 0.421875, + "reward_std": 0.20517179369926453, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 0.0017577135004103184, + "sampling/sampling_logp_difference/max": 6.343741416931152, + "sampling/sampling_logp_difference/mean": 0.020475786179304123, + "step": 465 + }, + { + "clip_ratio/high_max": 5.442162637336878e-05, + "clip_ratio/high_mean": 1.584139977239829e-05, + "clip_ratio/low_mean": 5.706528349946893e-05, + "clip_ratio/low_min": 2.5156462925224332e-05, + "clip_ratio/region_mean": 7.290668463610928e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15896.0, + "completions/mean_length": 5989.78125, + "completions/mean_terminated_length": 5654.48388671875, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.8479711338877678, + "epoch": 0.42870285188592455, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033953245729207993, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 410185645.0, + "reward": 0.5, + "reward_std": 0.3735082745552063, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999676942825317, + "sampling/importance_sampling_ratio/min": 1.781588616722729e-05, + "sampling/sampling_logp_difference/max": 10.935420036315918, + "sampling/sampling_logp_difference/mean": 0.017986344173550606, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.2673244681500364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2673244681500364e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 8299.9453125, + "completions/mean_terminated_length": 8171.62744140625, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "entropy": 0.9363152608275414, + "epoch": 0.4296228150873965, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002381247701123357, + "learning_rate": 1e-05, + "loss": 0.0651, + "num_tokens": 411268974.0, + "reward": 0.2890625, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 0.000553094083443284, + "sampling/sampling_logp_difference/max": 7.4999823570251465, + "sampling/sampling_logp_difference/mean": 0.021354343742132187, + "step": 467 + }, + { + "clip_ratio/high_max": 8.578695997130126e-06, + "clip_ratio/high_mean": 2.1446739992825314e-06, + "clip_ratio/low_mean": 2.84454882830687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.059016239603807e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14838.0, + "completions/mean_length": 7434.0546875, + "completions/mean_terminated_length": 7219.25634765625, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "entropy": 
0.981913685798645, + "epoch": 0.43054277828886844, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006341467145830393, + "learning_rate": 1e-05, + "loss": -0.003, + "num_tokens": 412238117.0, + "reward": 0.390625, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 0.0019304680172353983, + "sampling/sampling_logp_difference/max": 6.249992847442627, + "sampling/sampling_logp_difference/mean": 0.02139873616397381, + "step": 468 + }, + { + "clip_ratio/high_max": 1.7187987396027893e-05, + "clip_ratio/high_mean": 5.150076049176278e-06, + "clip_ratio/low_mean": 5.4699471832009294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.9849548279089504e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15871.0, + "completions/mean_length": 7211.1796875, + "completions/mean_terminated_length": 7138.95263671875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "entropy": 0.9307222217321396, + "epoch": 0.43146274149034036, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002621602965518832, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 413182860.0, + "reward": 0.3203125, + "reward_std": 0.34716784954071045, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999529123306274, + "sampling/importance_sampling_ratio/min": 5.1446182624204084e-05, + "sampling/sampling_logp_difference/max": 9.874974250793457, + "sampling/sampling_logp_difference/mean": 0.020250719040632248, + "step": 469 + }, + { + "clip_ratio/high_max": 1.0867412584047997e-05, + "clip_ratio/high_mean": 
3.9217885614561965e-06, + "clip_ratio/low_mean": 4.7740833792886406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.16626223543426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15726.0, + "completions/mean_length": 5349.4296875, + "completions/mean_terminated_length": 5174.2783203125, + "completions/min_length": 983.0, + "completions/min_terminated_length": 983.0, + "entropy": 1.0213474333286285, + "epoch": 0.43238270469181234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035241330042481422, + "learning_rate": 1e-05, + "loss": 0.0657, + "num_tokens": 413885963.0, + "reward": 0.3046875, + "reward_std": 0.25330984592437744, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.0003569081309251487, + "sampling/sampling_logp_difference/max": 7.938032150268555, + "sampling/sampling_logp_difference/mean": 0.01975759118795395, + "step": 470 + }, + { + "clip_ratio/high_max": 1.469514609198086e-05, + "clip_ratio/high_mean": 3.673786522995215e-06, + "clip_ratio/low_mean": 2.699725871480041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0671045237795624e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15357.0, + "completions/mean_length": 7542.8515625, + "completions/mean_terminated_length": 7257.65283203125, + "completions/min_length": 1359.0, + "completions/min_terminated_length": 1359.0, + "entropy": 0.8882969543337822, + "epoch": 0.43330266789328425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014164346503093839, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 414870560.0, + "reward": 0.3671875, + "reward_std": 0.20753081142902374, + "rewards/accuracy_reward/mean": 
0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000402927398682, + "sampling/importance_sampling_ratio/min": 6.435441900976002e-05, + "sampling/sampling_logp_difference/max": 9.651104927062988, + "sampling/sampling_logp_difference/mean": 0.020874422043561935, + "step": 471 + }, + { + "clip_ratio/high_max": 1.669827497607912e-05, + "clip_ratio/high_mean": 4.17456874401978e-06, + "clip_ratio/low_mean": 3.673103901746799e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.090560787517461e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7286.90625, + "completions/mean_terminated_length": 6993.451171875, + "completions/min_length": 977.0, + "completions/min_terminated_length": 977.0, + "entropy": 0.9254636988043785, + "epoch": 0.43422263109475623, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026956009678542614, + "learning_rate": 1e-05, + "loss": 0.0567, + "num_tokens": 415825252.0, + "reward": 0.328125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999917209148407, + "sampling/importance_sampling_ratio/min": 0.0019701423589140177, + "sampling/sampling_logp_difference/max": 6.229649543762207, + "sampling/sampling_logp_difference/mean": 0.0202642735093832, + "step": 472 + }, + { + "clip_ratio/high_max": 9.162045444099931e-06, + "clip_ratio/high_mean": 2.2905113610249828e-06, + "clip_ratio/low_mean": 3.818475033767754e-05, + "clip_ratio/low_min": 7.20606476534158e-06, + "clip_ratio/region_mean": 4.047526181238936e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15908.0, + 
"completions/mean_length": 7244.7421875, + "completions/mean_terminated_length": 6716.0244140625, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.7817923128604889, + "epoch": 0.43514259429622815, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022128887940198183, + "learning_rate": 1e-05, + "loss": 0.0577, + "num_tokens": 416774011.0, + "reward": 0.453125, + "reward_std": 0.2937847375869751, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0015034435782581568, + "sampling/sampling_logp_difference/max": 6.499997138977051, + "sampling/sampling_logp_difference/mean": 0.01840684749186039, + "step": 473 + }, + { + "clip_ratio/high_max": 1.2232871313244686e-05, + "clip_ratio/high_mean": 3.0582178283111716e-06, + "clip_ratio/low_mean": 3.636896872194484e-05, + "clip_ratio/low_min": 3.1460788250115e-06, + "clip_ratio/region_mean": 3.9427186266038916e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16254.0, + "completions/mean_length": 9042.90625, + "completions/mean_terminated_length": 8283.482421875, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "entropy": 0.9306210279464722, + "epoch": 0.43606255749770007, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034676652867347, + "learning_rate": 1e-05, + "loss": 0.0504, + "num_tokens": 417951311.0, + "reward": 0.265625, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999234080314636, + "sampling/importance_sampling_ratio/min": 0.0002641192404553294, + 
"sampling/sampling_logp_difference/max": 8.239109992980957, + "sampling/sampling_logp_difference/mean": 0.02112819254398346, + "step": 474 + }, + { + "clip_ratio/high_max": 2.5187824576278217e-05, + "clip_ratio/high_mean": 8.202394610634656e-06, + "clip_ratio/low_mean": 4.3606626604741905e-05, + "clip_ratio/low_min": 3.5752079838857753e-06, + "clip_ratio/region_mean": 5.1809020988002885e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15721.0, + "completions/mean_length": 6763.6328125, + "completions/mean_terminated_length": 6610.9287109375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9879302233457565, + "epoch": 0.43698252069917204, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030218157917261124, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 418836184.0, + "reward": 0.484375, + "reward_std": 0.30091896653175354, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 0.0003778560785576701, + "sampling/sampling_logp_difference/max": 7.880997180938721, + "sampling/sampling_logp_difference/mean": 0.021101050078868866, + "step": 475 + }, + { + "clip_ratio/high_max": 1.0644185749697499e-05, + "clip_ratio/high_mean": 2.6610464374243747e-06, + "clip_ratio/low_mean": 6.21261324340594e-05, + "clip_ratio/low_min": 3.6509140954876784e-06, + "clip_ratio/region_mean": 6.478717887148377e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15675.0, + "completions/mean_length": 6794.25, + "completions/mean_terminated_length": 6564.09619140625, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 1.0259138569235802, + "epoch": 
0.43790248390064396, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002881827764213085, + "learning_rate": 1e-05, + "loss": 0.0592, + "num_tokens": 419726192.0, + "reward": 0.265625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999275207519531, + "sampling/importance_sampling_ratio/min": 9.217044407705544e-07, + "sampling/sampling_logp_difference/max": 13.897041320800781, + "sampling/sampling_logp_difference/mean": 0.0210823193192482, + "step": 476 + }, + { + "clip_ratio/high_max": 1.108860487875063e-05, + "clip_ratio/high_mean": 2.7721512196876574e-06, + "clip_ratio/low_mean": 4.70996876629215e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9871839337356505e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14281.0, + "completions/max_terminated_length": 14281.0, + "completions/mean_length": 5648.2109375, + "completions/mean_terminated_length": 5648.2109375, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.88894472271204, + "epoch": 0.43882244710211593, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00289533962495625, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 420468867.0, + "reward": 0.484375, + "reward_std": 0.2675113081932068, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998449087142944, + "sampling/importance_sampling_ratio/min": 0.001372925122268498, + "sampling/sampling_logp_difference/max": 6.590811729431152, + "sampling/sampling_logp_difference/mean": 0.018499158322811127, + "step": 477 + }, + { + "clip_ratio/high_max": 4.753574557980755e-06, + "clip_ratio/high_mean": 1.1883936394951888e-06, + "clip_ratio/low_mean": 
2.4103785335682915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5292179316238617e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15657.0, + "completions/mean_length": 6188.359375, + "completions/mean_terminated_length": 6026.52392578125, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "entropy": 0.8476063013076782, + "epoch": 0.43974241030358785, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002749695209786296, + "learning_rate": 1e-05, + "loss": 0.0012, + "num_tokens": 421280881.0, + "reward": 0.3671875, + "reward_std": 0.15991678833961487, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796152114868, + "sampling/importance_sampling_ratio/min": 0.004578418098390102, + "sampling/sampling_logp_difference/max": 5.386401653289795, + "sampling/sampling_logp_difference/mean": 0.018456483259797096, + "step": 478 + }, + { + "clip_ratio/high_max": 4.1359915030625416e-05, + "clip_ratio/high_mean": 1.0339978757656354e-05, + "clip_ratio/low_mean": 4.786080125995795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8200780586048495e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 6864.3515625, + "completions/mean_terminated_length": 6635.88037109375, + "completions/min_length": 1065.0, + "completions/min_terminated_length": 1065.0, + "entropy": 0.8666203916072845, + "epoch": 0.4406623735050598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.005116373300552368, + "learning_rate": 1e-05, + "loss": 0.0347, + "num_tokens": 422177822.0, + "reward": 0.4453125, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 
0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 0.00020385721290949732, + "sampling/sampling_logp_difference/max": 8.498090744018555, + "sampling/sampling_logp_difference/mean": 0.01979806460440159, + "step": 479 + }, + { + "clip_ratio/high_max": 1.4544774558089557e-05, + "clip_ratio/high_mean": 3.6361936395223893e-06, + "clip_ratio/low_mean": 4.153812756158004e-05, + "clip_ratio/low_min": 3.606462769312202e-06, + "clip_ratio/region_mean": 4.51743208031985e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 7023.828125, + "completions/mean_terminated_length": 6799.18408203125, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9098334684967995, + "epoch": 0.44158233670653174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0020944855641573668, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 423096576.0, + "reward": 0.2734375, + "reward_std": 0.20858672261238098, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999480247497559, + "sampling/importance_sampling_ratio/min": 0.0027383591514080763, + "sampling/sampling_logp_difference/max": 5.900396347045898, + "sampling/sampling_logp_difference/mean": 0.020111342892050743, + "step": 480 + }, + { + "clip_ratio/high_max": 3.256236095694476e-05, + "clip_ratio/high_mean": 1.2372795026749372e-05, + "clip_ratio/low_mean": 5.0774355258909054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.314715119515313e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15527.0, + "completions/mean_length": 6666.828125, + 
"completions/mean_terminated_length": 6512.587890625, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "entropy": 0.9162466824054718, + "epoch": 0.44250229990800366, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003897767048329115, + "learning_rate": 1e-05, + "loss": 0.1151, + "num_tokens": 423968050.0, + "reward": 0.46875, + "reward_std": 0.3527044653892517, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0031828521750867367, + "sampling/sampling_logp_difference/max": 5.7499775886535645, + "sampling/sampling_logp_difference/mean": 0.019923247396945953, + "step": 481 + }, + { + "clip_ratio/high_max": 1.5341902098953142e-05, + "clip_ratio/high_mean": 4.791600815678976e-06, + "clip_ratio/low_mean": 7.980174223121139e-05, + "clip_ratio/low_min": 2.6713308216130827e-05, + "clip_ratio/region_mean": 8.459334412691533e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16223.0, + "completions/mean_length": 7159.8046875, + "completions/mean_terminated_length": 7013.38916015625, + "completions/min_length": 1022.0, + "completions/min_terminated_length": 1022.0, + "entropy": 0.8444746807217598, + "epoch": 0.44342226310947563, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003038195427507162, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 424902953.0, + "reward": 0.359375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940037727356, + "sampling/importance_sampling_ratio/min": 7.431909580191132e-06, + "sampling/sampling_logp_difference/max": 11.809727668762207, + 
"sampling/sampling_logp_difference/mean": 0.019014043733477592, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.55851120666739e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.55851120666739e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14716.0, + "completions/mean_length": 6146.2109375, + "completions/mean_terminated_length": 6065.5986328125, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.8365580290555954, + "epoch": 0.44434222631094755, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025550283025950193, + "learning_rate": 1e-05, + "loss": 0.0548, + "num_tokens": 425709212.0, + "reward": 0.5625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000015497207642, + "sampling/importance_sampling_ratio/min": 0.0006884043687023222, + "sampling/sampling_logp_difference/max": 7.281134128570557, + "sampling/sampling_logp_difference/mean": 0.019193854182958603, + "step": 483 + }, + { + "clip_ratio/high_max": 2.4752349872869672e-05, + "clip_ratio/high_mean": 7.036488455014478e-06, + "clip_ratio/low_mean": 4.780410063176532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.484058920046664e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16153.0, + "completions/mean_length": 6557.578125, + "completions/mean_terminated_length": 6321.744140625, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.8316832035779953, + "epoch": 0.4452621895124195, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005126865580677986, + "learning_rate": 1e-05, + "loss": 0.0698, + 
"num_tokens": 426566462.0, + "reward": 0.484375, + "reward_std": 0.27852246165275574, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 2.7536634661373682e-05, + "sampling/sampling_logp_difference/max": 10.499993324279785, + "sampling/sampling_logp_difference/mean": 0.01839536987245083, + "step": 484 + }, + { + "clip_ratio/high_max": 3.443571449679439e-05, + "clip_ratio/high_mean": 8.608928624198597e-06, + "clip_ratio/low_mean": 5.915772453590762e-05, + "clip_ratio/low_min": 1.7084812043322017e-05, + "clip_ratio/region_mean": 6.776665304641938e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16359.0, + "completions/mean_length": 7007.3203125, + "completions/mean_terminated_length": 6858.484375, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8674142584204674, + "epoch": 0.44618215271389144, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004829525947570801, + "learning_rate": 1e-05, + "loss": 0.0753, + "num_tokens": 427480007.0, + "reward": 0.46875, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998922944068909, + "sampling/importance_sampling_ratio/min": 0.00020170137577224523, + "sampling/sampling_logp_difference/max": 8.508722305297852, + "sampling/sampling_logp_difference/mean": 0.019586069509387016, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.539863354897534e-05, + "clip_ratio/low_min": 8.211341992137022e-06, + "clip_ratio/region_mean": 5.539863354897534e-05, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14748.0, + "completions/mean_length": 7069.8828125, + "completions/mean_terminated_length": 6922.0400390625, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.9066255167126656, + "epoch": 0.44710211591536336, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003539952216669917, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 428404968.0, + "reward": 0.5, + "reward_std": 0.3618982434272766, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 0.00024052867956925184, + "sampling/sampling_logp_difference/max": 8.332671165466309, + "sampling/sampling_logp_difference/mean": 0.020427238196134567, + "step": 486 + }, + { + "clip_ratio/high_max": 1.6550495729461545e-05, + "clip_ratio/high_mean": 4.137623932365386e-06, + "clip_ratio/low_mean": 5.576918465521885e-05, + "clip_ratio/low_min": 1.2613936178240692e-05, + "clip_ratio/region_mean": 5.99068093833921e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15290.0, + "completions/max_terminated_length": 15290.0, + "completions/mean_length": 5586.6875, + "completions/mean_terminated_length": 5586.6875, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.9208655655384064, + "epoch": 0.44802207911683534, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0030504625756293535, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 429137176.0, + "reward": 0.515625, + "reward_std": 0.3480040729045868, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999984502792358, + 
"sampling/importance_sampling_ratio/min": 0.0005498559912666678, + "sampling/sampling_logp_difference/max": 7.50585412979126, + "sampling/sampling_logp_difference/mean": 0.019396595656871796, + "step": 487 + }, + { + "clip_ratio/high_max": 3.3761509712348925e-05, + "clip_ratio/high_mean": 8.440377428087231e-06, + "clip_ratio/low_mean": 3.6384140912559815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.482451868170756e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15404.0, + "completions/mean_length": 5266.265625, + "completions/mean_terminated_length": 4999.4404296875, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.7884859293699265, + "epoch": 0.44894204231830726, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003902251599356532, + "learning_rate": 1e-05, + "loss": -0.0077, + "num_tokens": 429836026.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.05675617232918739, + "sampling/sampling_logp_difference/max": 2.868990898132324, + "sampling/sampling_logp_difference/mean": 0.01770034246146679, + "step": 488 + }, + { + "clip_ratio/high_max": 2.2323702978610527e-05, + "clip_ratio/high_mean": 5.580925744652632e-06, + "clip_ratio/low_mean": 4.0199149452746497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.578007497002545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6398.53125, + "completions/mean_terminated_length": 6319.9052734375, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "entropy": 
0.8982341960072517, + "epoch": 0.44986200551977923, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024998660665005445, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 430673446.0, + "reward": 0.421875, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797940254211, + "sampling/importance_sampling_ratio/min": 0.000612784584518522, + "sampling/sampling_logp_difference/max": 7.397497177124023, + "sampling/sampling_logp_difference/mean": 0.020521972328424454, + "step": 489 + }, + { + "clip_ratio/high_max": 3.1756624366607866e-05, + "clip_ratio/high_mean": 7.939156091651967e-06, + "clip_ratio/low_mean": 8.124458963720826e-05, + "clip_ratio/low_min": 1.2379174222587608e-05, + "clip_ratio/region_mean": 8.91837471499457e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14374.0, + "completions/mean_length": 6277.65625, + "completions/mean_terminated_length": 6198.07861328125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8139145970344543, + "epoch": 0.45078196872125115, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00784115307033062, + "learning_rate": 1e-05, + "loss": 0.0798, + "num_tokens": 431497546.0, + "reward": 0.546875, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999848484992981, + "sampling/importance_sampling_ratio/min": 0.0006267798598855734, + "sampling/sampling_logp_difference/max": 7.37491512298584, + "sampling/sampling_logp_difference/mean": 0.01836184598505497, + "step": 490 + }, + { + "clip_ratio/high_max": 8.875004823494237e-06, + "clip_ratio/high_mean": 
2.2187512058735592e-06, + "clip_ratio/low_mean": 2.3825880248296016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6044631454169576e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15903.0, + "completions/mean_length": 7708.59375, + "completions/mean_terminated_length": 7355.9345703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.087083138525486, + "epoch": 0.45170193192272307, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004277343396097422, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 432503414.0, + "reward": 0.2890625, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999503493309021, + "sampling/importance_sampling_ratio/min": 1.2187546417408157e-05, + "sampling/sampling_logp_difference/max": 11.315095901489258, + "sampling/sampling_logp_difference/mean": 0.02224145457148552, + "step": 491 + }, + { + "clip_ratio/high_max": 6.384065272868611e-06, + "clip_ratio/high_mean": 1.5960163182171527e-06, + "clip_ratio/low_mean": 3.561227788395627e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.720829374742607e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7162.7109375, + "completions/mean_terminated_length": 6865.25, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.9157010763883591, + "epoch": 0.45262189512419504, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006278311368077993, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 433439137.0, + "reward": 0.5078125, + "reward_std": 0.2227931171655655, + "rewards/accuracy_reward/mean": 0.5078125, + 
"rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966561794281, + "sampling/importance_sampling_ratio/min": 0.0005532125360332429, + "sampling/sampling_logp_difference/max": 7.499768257141113, + "sampling/sampling_logp_difference/mean": 0.02123419940471649, + "step": 492 + }, + { + "clip_ratio/high_max": 2.846911434062349e-05, + "clip_ratio/high_mean": 8.656040449750435e-06, + "clip_ratio/low_mean": 5.1716241614485625e-05, + "clip_ratio/low_min": 3.601579010137357e-06, + "clip_ratio/region_mean": 6.037228104105452e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16123.0, + "completions/mean_length": 7388.90625, + "completions/mean_terminated_length": 7023.251953125, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "entropy": 0.7670486867427826, + "epoch": 0.45354185832566696, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005177734419703484, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 434402045.0, + "reward": 0.3828125, + "reward_std": 0.37951958179473877, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999250769615173, + "sampling/importance_sampling_ratio/min": 0.0022511729039251804, + "sampling/sampling_logp_difference/max": 6.096303939819336, + "sampling/sampling_logp_difference/mean": 0.01827731542289257, + "step": 493 + }, + { + "clip_ratio/high_max": 2.1548471977439476e-05, + "clip_ratio/high_mean": 6.257203722270788e-06, + "clip_ratio/low_mean": 7.719641234871233e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.345361538886209e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15767.0, + 
"completions/mean_length": 6805.375, + "completions/mean_terminated_length": 6496.38671875, + "completions/min_length": 587.0, + "completions/min_terminated_length": 587.0, + "entropy": 0.8407405763864517, + "epoch": 0.45446182152713893, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032320048194378614, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 435292029.0, + "reward": 0.4296875, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999642372131348, + "sampling/importance_sampling_ratio/min": 6.679954094579443e-05, + "sampling/sampling_logp_difference/max": 9.613814353942871, + "sampling/sampling_logp_difference/mean": 0.018761277198791504, + "step": 494 + }, + { + "clip_ratio/high_max": 3.460495008766884e-06, + "clip_ratio/high_mean": 8.65123752191721e-07, + "clip_ratio/low_mean": 7.76378024056612e-05, + "clip_ratio/low_min": 1.7026316072588088e-05, + "clip_ratio/region_mean": 7.850292649891344e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15105.0, + "completions/mean_length": 5753.4140625, + "completions/mean_terminated_length": 5321.2763671875, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "entropy": 0.7848984077572823, + "epoch": 0.45538178472861085, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030854379292577505, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 436046842.0, + "reward": 0.578125, + "reward_std": 0.31405961513519287, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998626708984375, + "sampling/importance_sampling_ratio/min": 4.36544311810394e-09, + 
"sampling/sampling_logp_difference/max": 19.24954605102539, + "sampling/sampling_logp_difference/mean": 0.017733070999383926, + "step": 495 + }, + { + "clip_ratio/high_max": 1.7207588371093152e-05, + "clip_ratio/high_mean": 4.301897092773288e-06, + "clip_ratio/low_mean": 3.234025916754035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.664215591925313e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6522.84375, + "completions/mean_terminated_length": 6445.19677734375, + "completions/min_length": 1062.0, + "completions/min_terminated_length": 1062.0, + "entropy": 1.0593653172254562, + "epoch": 0.4563017479300828, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003124243812635541, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 436899638.0, + "reward": 0.4140625, + "reward_std": 0.2706219553947449, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999418258666992, + "sampling/importance_sampling_ratio/min": 4.476920821616659e-06, + "sampling/sampling_logp_difference/max": 12.316575050354004, + "sampling/sampling_logp_difference/mean": 0.021180003881454468, + "step": 496 + }, + { + "clip_ratio/high_max": 1.1790433973146719e-05, + "clip_ratio/high_mean": 2.9476084932866797e-06, + "clip_ratio/low_mean": 2.8437304308681632e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.138491274512489e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14515.0, + "completions/mean_length": 6203.203125, + "completions/mean_terminated_length": 5874.7900390625, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.8152795508503914, + "epoch": 0.45722171113155474, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.005001795012503862, + "learning_rate": 1e-05, + "loss": 0.0817, + "num_tokens": 437713008.0, + "reward": 0.4296875, + "reward_std": 0.26143795251846313, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101758003235, + "sampling/importance_sampling_ratio/min": 0.001757707679644227, + "sampling/sampling_logp_difference/max": 6.34374475479126, + "sampling/sampling_logp_difference/mean": 0.017751028761267662, + "step": 497 + }, + { + "clip_ratio/high_max": 1.3163793028070359e-05, + "clip_ratio/high_mean": 4.229499381835922e-06, + "clip_ratio/low_mean": 4.4599403963729856e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.882890357293945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15423.0, + "completions/mean_length": 5975.5234375, + "completions/mean_terminated_length": 5725.72021484375, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "entropy": 0.8275932744145393, + "epoch": 0.45814167433302666, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005084732081741095, + "learning_rate": 1e-05, + "loss": 0.0759, + "num_tokens": 438495811.0, + "reward": 0.5390625, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998699426651001, + "sampling/importance_sampling_ratio/min": 3.120788460364565e-05, + "sampling/sampling_logp_difference/max": 10.374839782714844, + "sampling/sampling_logp_difference/mean": 0.018671832978725433, + "step": 498 + }, + { + "clip_ratio/high_max": 3.229640242352616e-06, + "clip_ratio/high_mean": 8.07410060588154e-07, + "clip_ratio/low_mean": 
3.0413870263146237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1221280551108066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 7019.59375, + "completions/mean_terminated_length": 7019.59375, + "completions/min_length": 1058.0, + "completions/min_terminated_length": 1058.0, + "entropy": 0.9266618490219116, + "epoch": 0.45906163753449863, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002567912917584181, + "learning_rate": 1e-05, + "loss": 0.0282, + "num_tokens": 439413055.0, + "reward": 0.375, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000476837158203, + "sampling/importance_sampling_ratio/min": 0.0010315657127648592, + "sampling/sampling_logp_difference/max": 6.876677513122559, + "sampling/sampling_logp_difference/mean": 0.02012534812092781, + "step": 499 + }, + { + "clip_ratio/high_max": 1.8327779343962902e-05, + "clip_ratio/high_mean": 4.5819448359907256e-06, + "clip_ratio/low_mean": 4.08189575864526e-05, + "clip_ratio/low_min": 4.041122338094283e-06, + "clip_ratio/region_mean": 4.5400901854009135e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7373.3203125, + "completions/mean_terminated_length": 7082.65283203125, + "completions/min_length": 854.0, + "completions/min_terminated_length": 854.0, + "entropy": 0.9383682310581207, + "epoch": 0.45998160073597055, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004862098954617977, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 440375128.0, + "reward": 0.4375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 
0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.0006883886526338756, + "sampling/sampling_logp_difference/max": 7.28115701675415, + "sampling/sampling_logp_difference/mean": 0.020596595481038094, + "step": 500 + }, + { + "clip_ratio/high_max": 1.650619151405408e-05, + "clip_ratio/high_mean": 4.12654787851352e-06, + "clip_ratio/low_mean": 6.364750265674957e-05, + "clip_ratio/low_min": 3.94595599573222e-06, + "clip_ratio/region_mean": 6.77740499668289e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16280.0, + "completions/mean_length": 5944.953125, + "completions/mean_terminated_length": 5862.755859375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "entropy": 0.9130716845393181, + "epoch": 0.4609015639374425, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041388699784875, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 441156306.0, + "reward": 0.3984375, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999566078186035, + "sampling/importance_sampling_ratio/min": 0.0007685241289436817, + "sampling/sampling_logp_difference/max": 7.171038627624512, + "sampling/sampling_logp_difference/mean": 0.019817989319562912, + "step": 501 + }, + { + "clip_ratio/high_max": 2.9951792839710834e-05, + "clip_ratio/high_mean": 9.205811807078135e-06, + "clip_ratio/low_mean": 3.147234815514821e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0678160075913183e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16181.0, + "completions/mean_length": 6686.015625, + 
"completions/mean_terminated_length": 6609.6533203125, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 0.8640913739800453, + "epoch": 0.46182152713891444, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005679543130099773, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 442032972.0, + "reward": 0.5546875, + "reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 0.007731473073363304, + "sampling/sampling_logp_difference/max": 4.86245584487915, + "sampling/sampling_logp_difference/mean": 0.019738182425498962, + "step": 502 + }, + { + "clip_ratio/high_max": 3.0190597726686974e-05, + "clip_ratio/high_mean": 7.5476494316717435e-06, + "clip_ratio/low_mean": 3.858067566397949e-05, + "clip_ratio/low_min": 9.290916750614997e-06, + "clip_ratio/region_mean": 4.612832617567619e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 6945.5, + "completions/mean_terminated_length": 6231.6640625, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.8156519457697868, + "epoch": 0.46274149034038636, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006176612339913845, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 442940940.0, + "reward": 0.46875, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999117851257324, + "sampling/importance_sampling_ratio/min": 0.00018278000061400235, + "sampling/sampling_logp_difference/max": 8.607227325439453, + 
"sampling/sampling_logp_difference/mean": 0.01836501806974411, + "step": 503 + }, + { + "clip_ratio/high_max": 2.2105000425653998e-05, + "clip_ratio/high_mean": 6.28071654773521e-06, + "clip_ratio/low_mean": 3.060894187001395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6889658531436e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15847.0, + "completions/mean_length": 8068.5390625, + "completions/mean_terminated_length": 7363.8388671875, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "entropy": 0.8196670189499855, + "epoch": 0.46366145354185834, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021770994644612074, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 443992041.0, + "reward": 0.4453125, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999759197235107, + "sampling/importance_sampling_ratio/min": 0.0001795605494407937, + "sampling/sampling_logp_difference/max": 8.624998092651367, + "sampling/sampling_logp_difference/mean": 0.019003838300704956, + "step": 504 + }, + { + "clip_ratio/high_max": 1.287241002501105e-05, + "clip_ratio/high_mean": 3.2181025062527624e-06, + "clip_ratio/low_mean": 4.5685408849749365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.89035115833758e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15168.0, + "completions/mean_length": 5209.140625, + "completions/mean_terminated_length": 5031.76220703125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.8851845487952232, + "epoch": 0.46458141674333026, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00788798462599516, + 
"learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 444679675.0, + "reward": 0.4609375, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796748161316, + "sampling/importance_sampling_ratio/min": 0.00025673024356365204, + "sampling/sampling_logp_difference/max": 8.267484664916992, + "sampling/sampling_logp_difference/mean": 0.018808994442224503, + "step": 505 + }, + { + "clip_ratio/high_max": 2.294301202709903e-05, + "clip_ratio/high_mean": 6.590465602585027e-06, + "clip_ratio/low_mean": 5.944662643742049e-05, + "clip_ratio/low_min": 8.106994755507912e-06, + "clip_ratio/region_mean": 6.603709243790945e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16259.0, + "completions/mean_length": 7558.8984375, + "completions/mean_terminated_length": 7274.21728515625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.003449946641922, + "epoch": 0.46550137994480223, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004547314252704382, + "learning_rate": 1e-05, + "loss": 0.1586, + "num_tokens": 445668126.0, + "reward": 0.421875, + "reward_std": 0.42293959856033325, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999848484992981, + "sampling/importance_sampling_ratio/min": 0.00011622780584730208, + "sampling/sampling_logp_difference/max": 9.059958457946777, + "sampling/sampling_logp_difference/mean": 0.02099413052201271, + "step": 506 + }, + { + "clip_ratio/high_max": 2.1350435872591333e-05, + "clip_ratio/high_mean": 6.047981628398702e-06, + "clip_ratio/low_mean": 8.880347786544007e-05, + "clip_ratio/low_min": 9.06585455595632e-06, + 
"clip_ratio/region_mean": 9.485145938015194e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16137.0, + "completions/max_terminated_length": 16137.0, + "completions/mean_length": 6066.6015625, + "completions/mean_terminated_length": 6066.6015625, + "completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "entropy": 0.8450648710131645, + "epoch": 0.46642134314627415, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004621773958206177, + "learning_rate": 1e-05, + "loss": 0.121, + "num_tokens": 446464587.0, + "reward": 0.5390625, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000154972076416, + "sampling/importance_sampling_ratio/min": 1.3950601896794979e-05, + "sampling/sampling_logp_difference/max": 11.179987907409668, + "sampling/sampling_logp_difference/mean": 0.018016980960965157, + "step": 507 + }, + { + "clip_ratio/high_max": 3.0534724828612525e-06, + "clip_ratio/high_mean": 7.633681207153131e-07, + "clip_ratio/low_mean": 2.149350007130124e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2256868305703392e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 6988.0234375, + "completions/mean_terminated_length": 6838.88134765625, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 1.0452716201543808, + "epoch": 0.46734130634774607, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004523546434938908, + "learning_rate": 1e-05, + "loss": 0.0396, + "num_tokens": 447381134.0, + "reward": 0.3515625, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999901056289673, + "sampling/importance_sampling_ratio/min": 0.016167031601071358, + "sampling/sampling_logp_difference/max": 4.124781131744385, + "sampling/sampling_logp_difference/mean": 0.021812722086906433, + "step": 508 + }, + { + "clip_ratio/high_max": 5.58759120394825e-06, + "clip_ratio/high_mean": 1.3968978009870625e-06, + "clip_ratio/low_mean": 3.684896307731833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.824586099199223e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12316.0, + "completions/max_terminated_length": 12316.0, + "completions/mean_length": 5948.5, + "completions/mean_terminated_length": 5948.5, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8241566568613052, + "epoch": 0.46826126954921804, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004002885892987251, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 448158014.0, + "reward": 0.5703125, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 0.0008566387114115059, + "sampling/sampling_logp_difference/max": 7.062494277954102, + "sampling/sampling_logp_difference/mean": 0.018487900495529175, + "step": 509 + }, + { + "clip_ratio/high_max": 1.0490723752809572e-05, + "clip_ratio/high_mean": 3.439610338773491e-06, + "clip_ratio/low_mean": 3.973086239739132e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3170473020381905e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16044.0, + "completions/mean_length": 7966.375, + "completions/mean_terminated_length": 7764.3525390625, + "completions/min_length": 660.0, + 
"completions/min_terminated_length": 660.0, + "entropy": 0.8868448063731194, + "epoch": 0.46918123275068996, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019062751671299338, + "learning_rate": 1e-05, + "loss": 0.0787, + "num_tokens": 449197054.0, + "reward": 0.40625, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0001614262000657618, + "sampling/sampling_logp_difference/max": 8.731462478637695, + "sampling/sampling_logp_difference/mean": 0.020015282556414604, + "step": 510 + }, + { + "clip_ratio/high_max": 1.2195105682621943e-05, + "clip_ratio/high_mean": 3.0487764206554857e-06, + "clip_ratio/low_mean": 3.558348203114292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8632259474979946e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 6520.0234375, + "completions/mean_terminated_length": 6442.3544921875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9168323278427124, + "epoch": 0.47010119595216193, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00490277074277401, + "learning_rate": 1e-05, + "loss": 0.0547, + "num_tokens": 450050153.0, + "reward": 0.484375, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 4.4418397919798736e-06, + "sampling/sampling_logp_difference/max": 12.324441909790039, + "sampling/sampling_logp_difference/mean": 0.020178331062197685, + "step": 511 + }, + { + "clip_ratio/high_max": 
7.95772848505294e-06, + "clip_ratio/high_mean": 1.989432121263235e-06, + "clip_ratio/low_mean": 3.363800146871654e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.562743381735345e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 6614.5625, + "completions/mean_terminated_length": 6217.4306640625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.8635925352573395, + "epoch": 0.47102115915363385, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003792276605963707, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 450915281.0, + "reward": 0.5, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999154806137085, + "sampling/importance_sampling_ratio/min": 0.004489119164645672, + "sampling/sampling_logp_difference/max": 5.40609884262085, + "sampling/sampling_logp_difference/mean": 0.019233014434576035, + "step": 512 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 450915281, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-512/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-512/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-512/zero_to_fp32.py @@ -0,0 +1,760 @@ 
+#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should 
be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in 
state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/README.md b/dapo_milora_plus_20251201_131939/checkpoint-576/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-576/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-576/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-576/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first 
%}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/latest b/dapo_milora_plus_20251201_131939/checkpoint-576/latest new file mode 100644 index 0000000000000000000000000000000000000000..1a40031386820b60f3a54acbdbae4813e4a986c7 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-576/latest @@ -0,0 +1 @@ +global_step576 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-576/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-576/zero_to_fp32.py new file mode 100644 index 
0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-576/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-64/latest b/dapo_milora_plus_20251201_131939/checkpoint-64/latest new file mode 100644 index 0000000000000000000000000000000000000000..4a12e7f9029554e8e5ce68ebe3e97d0b4e734304 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-64/latest @@ -0,0 +1 @@ +global_step64 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-64/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-64/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-64/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": 
"<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/README.md b/dapo_milora_plus_20251201_131939/checkpoint-640/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-640/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- 
**Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-640/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-640/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first 
%}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/latest b/dapo_milora_plus_20251201_131939/checkpoint-640/latest new file mode 100644 index 0000000000000000000000000000000000000000..4a8906aefa3405aec9d51931707431ef44f4dace --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-640/latest @@ -0,0 +1 @@ +global_step640 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/tokenizer_config.json b/dapo_milora_plus_20251201_131939/checkpoint-640/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-640/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": 
"<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff 
--git a/dapo_milora_plus_20251201_131939/checkpoint-640/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-640/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f57c3a28876bdc73fe6f5aa88ea5d533caac1336 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-640/trainer_state.json @@ -0,0 +1,19874 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5887764489420423, + "eval_steps": 500, + "global_step": 640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + "clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + "completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + 
"completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + "sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 
4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, + "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 
0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 
8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 
0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + "clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + 
"epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + 
"clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 
6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + 
"step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, + "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 
0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, + "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + 
"sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + 
"epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + 
"completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + "sampling/sampling_logp_difference/max": 11.227011680603027, + 
"sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 
0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + "clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + 
"completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + 
"sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 
1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 8.888357569958316e-06, + "clip_ratio/high_mean": 
2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 0.1990984082221985, + "rewards/accuracy_reward/mean": 
0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + 
"completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 0.0036513316445052624, + "sampling/sampling_logp_difference/max": 
5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + 
"learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 3.960530659696815e-05, + "clip_ratio/low_min": 
3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + "completions/mean_length": 6552.40625, + 
"completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, + 
"sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + "learning_rate": 
1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + 
"completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 
6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 
5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + 
"sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + 
"loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + 
"completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 
4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 
0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + 
"completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 
0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 
0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 
3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 
6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 
0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 
0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + 
"sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + 
"completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + 
"sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 
84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + 
"sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 
0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 
5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + 
"rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + 
"completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + 
"sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + 
"learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 
525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 
7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + 
"reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + 
"sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + 
"completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + 
"sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 
1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 
169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + 
"clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 
0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + 
"sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 
656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + 
"clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + 
"completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 
18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 
6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, 
+ "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 
140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + 
"completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 
8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + 
"reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + 
"sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + 
"entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 
3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + 
"completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + 
"sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 
0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 
2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + 
"completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + 
"sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + "sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 
819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + "completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + 
"clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + "sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + 
"reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + 
"sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 
3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + 
"completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + 
"clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + 
"reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + 
"sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + 
"entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 
3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + 
"rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + "reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + 
"completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 
10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 
0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 
63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 
4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 
0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + 
"sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 
6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + "completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + 
"step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 
217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + 
"completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + }, + { + "clip_ratio/high_max": 6.87833608026267e-06, + "clip_ratio/high_mean": 2.9462287329806713e-06, + "clip_ratio/low_mean": 5.435333650893881e-05, + "clip_ratio/low_min": 5.33937054569833e-06, + "clip_ratio/region_mean": 5.729956546929316e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 6448.0078125, + "completions/mean_terminated_length": 6369.771484375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9546648040413857, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004310046322643757, + "learning_rate": 1e-05, + "loss": 0.1082, + "num_tokens": 220304605.0, + "reward": 0.5703125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 0.0001234127557836473, + "sampling/sampling_logp_difference/max": 8.99997615814209, + "sampling/sampling_logp_difference/mean": 0.020253397524356842, + "step": 257 + }, + { + "clip_ratio/high_max": 6.196094091137638e-06, + "clip_ratio/high_mean": 1.5490235227844096e-06, + "clip_ratio/low_mean": 2.5416685957679874e-05, + "clip_ratio/low_min": 5.5736391004757024e-06, + "clip_ratio/region_mean": 2.696570959415112e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 7457.6484375, + "completions/mean_terminated_length": 6941.24755859375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "entropy": 0.8182889074087143, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026646999176591635, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 221281968.0, + "reward": 0.4453125, + "reward_std": 0.2012200653553009, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173283576965, + "sampling/importance_sampling_ratio/min": 2.902353571698768e-06, + "sampling/sampling_logp_difference/max": 12.749988555908203, + "sampling/sampling_logp_difference/mean": 0.019208962097764015, + "step": 258 + }, + { + "clip_ratio/high_max": 1.6189535017474554e-05, + "clip_ratio/high_mean": 4.047383754368639e-06, + "clip_ratio/low_mean": 3.127787306311802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.532525670379982e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 8561.109375, + "completions/mean_terminated_length": 7969.79052734375, + 
"completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.9581378549337387, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016026750672608614, + "learning_rate": 1e-05, + "loss": 0.0131, + "num_tokens": 222399046.0, + "reward": 0.34375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 1.653693971093162e-06, + "sampling/sampling_logp_difference/max": 13.312499046325684, + "sampling/sampling_logp_difference/mean": 0.02173236384987831, + "step": 259 + }, + { + "clip_ratio/high_max": 1.4200771602190798e-05, + "clip_ratio/high_mean": 4.3255887476334465e-06, + "clip_ratio/low_mean": 5.2955770115659107e-05, + "clip_ratio/low_min": 3.402656830076012e-06, + "clip_ratio/region_mean": 5.7281358749605715e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16239.0, + "completions/mean_length": 7152.34375, + "completions/mean_terminated_length": 7079.6533203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9052041247487068, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005460259038954973, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 223335010.0, + "reward": 0.4296875, + "reward_std": 0.3356297016143799, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966621398926, + "sampling/importance_sampling_ratio/min": 0.010161337442696095, + "sampling/sampling_logp_difference/max": 4.589165210723877, + "sampling/sampling_logp_difference/mean": 0.01986619457602501, 
+ "step": 260 + }, + { + "clip_ratio/high_max": 1.4350314813782461e-05, + "clip_ratio/high_mean": 3.5875787034456152e-06, + "clip_ratio/low_mean": 3.81288905373367e-05, + "clip_ratio/low_min": 8.099272235995159e-06, + "clip_ratio/region_mean": 4.1716469809216505e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 6678.65625, + "completions/mean_terminated_length": 6524.603515625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.9043187350034714, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005933742038905621, + "learning_rate": 1e-05, + "loss": 0.0966, + "num_tokens": 224207006.0, + "reward": 0.484375, + "reward_std": 0.3316681981086731, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000031590461731, + "sampling/importance_sampling_ratio/min": 0.0011734943836927414, + "sampling/sampling_logp_difference/max": 6.747769355773926, + "sampling/sampling_logp_difference/mean": 0.019827336072921753, + "step": 261 + }, + { + "clip_ratio/high_max": 1.6498819377375185e-05, + "clip_ratio/high_mean": 4.124704844343796e-06, + "clip_ratio/low_mean": 3.601791678420341e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014262168539062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6999.0390625, + "completions/mean_terminated_length": 6850.07177734375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8109970837831497, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003635740838944912, + "learning_rate": 1e-05, + "loss": 0.104, + "num_tokens": 
225122891.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303817749023, + "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05, + "sampling/sampling_logp_difference/max": 10.987512588500977, + "sampling/sampling_logp_difference/mean": 0.018912551924586296, + "step": 262 + }, + { + "clip_ratio/high_max": 9.527577958579059e-06, + "clip_ratio/high_mean": 2.3818944896447647e-06, + "clip_ratio/low_mean": 3.766565987461945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004755419373396e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7483.7109375, + "completions/mean_terminated_length": 7045.9912109375, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "entropy": 0.9473970532417297, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003405241761356592, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 226102462.0, + "reward": 0.4453125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002920627594, + "sampling/importance_sampling_ratio/min": 0.00525119062513113, + "sampling/sampling_logp_difference/max": 5.249300479888916, + "sampling/sampling_logp_difference/mean": 0.021076779812574387, + "step": 263 + }, + { + "clip_ratio/high_max": 1.5867321963014547e-05, + "clip_ratio/high_mean": 3.966830490753637e-06, + "clip_ratio/low_mean": 3.8259706570897833e-05, + "clip_ratio/low_min": 3.549019083948224e-06, + "clip_ratio/region_mean": 4.2226537743772496e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 7569.03125, + "completions/mean_terminated_length": 7357.47216796875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9231455475091934, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025927501264959574, + "learning_rate": 1e-05, + "loss": 0.0801, + "num_tokens": 227093562.0, + "reward": 0.3984375, + "reward_std": 0.19097033143043518, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0052477638237178326, + "sampling/sampling_logp_difference/max": 5.249953269958496, + "sampling/sampling_logp_difference/mean": 0.020578444004058838, + "step": 264 + }, + { + "clip_ratio/high_max": 1.344091060673236e-05, + "clip_ratio/high_mean": 3.36022765168309e-06, + "clip_ratio/low_mean": 4.253613235505327e-05, + "clip_ratio/low_min": 3.5579084851633525e-06, + "clip_ratio/region_mean": 4.5896360120423196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 7589.2734375, + "completions/mean_terminated_length": 7378.2001953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9265239909291267, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030512227676808834, + "learning_rate": 1e-05, + "loss": 0.04, + "num_tokens": 228086405.0, + "reward": 0.4296875, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0002165911573683843, + "sampling/sampling_logp_difference/max": 8.437499046325684, + "sampling/sampling_logp_difference/mean": 0.020208362489938736, + "step": 265 + }, + { + "clip_ratio/high_max": 1.9613525410022703e-05, + "clip_ratio/high_mean": 4.903381352505676e-06, + "clip_ratio/low_mean": 3.184792547017423e-05, + "clip_ratio/low_min": 7.29296516510658e-06, + "clip_ratio/region_mean": 3.675130722058384e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 8420.6875, + "completions/mean_terminated_length": 8096.97509765625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "entropy": 0.9572964608669281, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022430522367358208, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 229183765.0, + "reward": 0.34375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 0.00029693738906644285, + "sampling/sampling_logp_difference/max": 8.121989250183105, + "sampling/sampling_logp_difference/mean": 0.021570362150669098, + "step": 266 + }, + { + "clip_ratio/high_max": 6.728750577167375e-06, + "clip_ratio/high_mean": 1.6821876442918438e-06, + "clip_ratio/low_mean": 2.1682553096979973e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.336474062758498e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15736.0, + "completions/mean_length": 6809.765625, + "completions/mean_terminated_length": 6579.984375, + 
"completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.884086549282074, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004295065999031067, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 230077607.0, + "reward": 0.484375, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00754612497985363, + "sampling/sampling_logp_difference/max": 4.886721134185791, + "sampling/sampling_logp_difference/mean": 0.019895706325769424, + "step": 267 + }, + { + "clip_ratio/high_max": 2.8609347509700456e-05, + "clip_ratio/high_mean": 7.152336877425114e-06, + "clip_ratio/low_mean": 5.158006410965754e-05, + "clip_ratio/low_min": 5.210069957684027e-06, + "clip_ratio/region_mean": 5.873240070286556e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15080.0, + "completions/mean_length": 7340.6953125, + "completions/mean_terminated_length": 6973.0810546875, + "completions/min_length": 1616.0, + "completions/min_terminated_length": 1616.0, + "entropy": 0.9920620769262314, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004631794057786465, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 231035616.0, + "reward": 0.4375, + "reward_std": 0.3235401213169098, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.0002508950710762292, + "sampling/sampling_logp_difference/max": 8.290475845336914, + "sampling/sampling_logp_difference/mean": 0.020591016858816147, + 
"step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.3085940774290066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3085940774290066e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14120.0, + "completions/mean_length": 6748.875, + "completions/mean_terminated_length": 6595.93701171875, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.9867061004042625, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035752104595303535, + "learning_rate": 1e-05, + "loss": 0.0455, + "num_tokens": 231920056.0, + "reward": 0.40625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.0003869794018100947, + "sampling/sampling_logp_difference/max": 7.8571391105651855, + "sampling/sampling_logp_difference/mean": 0.02061416581273079, + "step": 269 + }, + { + "clip_ratio/high_max": 1.2506750408647349e-05, + "clip_ratio/high_mean": 3.1266876021618373e-06, + "clip_ratio/low_mean": 3.10397430212106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.416643085074611e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 7260.3046875, + "completions/mean_terminated_length": 7188.46435546875, + "completions/min_length": 1384.0, + "completions/min_terminated_length": 1384.0, + "entropy": 1.0388494208455086, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036644963547587395, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 232869159.0, + "reward": 0.390625, + "reward_std": 
0.2359209954738617, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999546408653259, + "sampling/importance_sampling_ratio/min": 0.0008660226594656706, + "sampling/sampling_logp_difference/max": 7.051599502563477, + "sampling/sampling_logp_difference/mean": 0.02120530977845192, + "step": 270 + }, + { + "clip_ratio/high_max": 2.704355301830219e-05, + "clip_ratio/high_mean": 6.760888254575548e-06, + "clip_ratio/low_mean": 3.1861192269388994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862208097871189e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16073.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 6354.4609375, + "completions/mean_terminated_length": 6354.4609375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "entropy": 0.8405331820249557, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004709267523139715, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 233702842.0, + "reward": 0.546875, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 0.0046309432946145535, + "sampling/sampling_logp_difference/max": 5.37499475479126, + "sampling/sampling_logp_difference/mean": 0.019126038998365402, + "step": 271 + }, + { + "clip_ratio/high_max": 9.749228638611385e-06, + "clip_ratio/high_mean": 2.437307159652846e-06, + "clip_ratio/low_mean": 3.855073941849696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.098804652130639e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16026.0, + "completions/mean_length": 6514.578125, + "completions/mean_terminated_length": 6357.9208984375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 1.0254098922014236, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003066045930609107, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 234556348.0, + "reward": 0.4375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 0.005210204049944878, + "sampling/sampling_logp_difference/max": 5.257136344909668, + "sampling/sampling_logp_difference/mean": 0.019960148259997368, + "step": 272 + }, + { + "clip_ratio/high_max": 1.0475813724042382e-05, + "clip_ratio/high_mean": 2.6189534310105955e-06, + "clip_ratio/low_mean": 3.487835761006863e-05, + "clip_ratio/low_min": 2.9392399483185727e-06, + "clip_ratio/region_mean": 3.749731081370555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 7379.5546875, + "completions/mean_terminated_length": 7236.62744140625, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 1.0397320613265038, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005132520105689764, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 235521091.0, + "reward": 0.2890625, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999256134033203, + "sampling/importance_sampling_ratio/min": 
0.00016659013635944575, + "sampling/sampling_logp_difference/max": 8.699974060058594, + "sampling/sampling_logp_difference/mean": 0.021417103707790375, + "step": 273 + }, + { + "clip_ratio/high_max": 1.9904123973901733e-05, + "clip_ratio/high_mean": 5.776861314643611e-06, + "clip_ratio/low_mean": 2.6659268655748747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2436129686175263e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 7837.1640625, + "completions/mean_terminated_length": 7632.04052734375, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "entropy": 0.8400963917374611, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028969801496714354, + "learning_rate": 1e-05, + "loss": 0.0143, + "num_tokens": 236544160.0, + "reward": 0.3828125, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887943267822, + "sampling/importance_sampling_ratio/min": 2.883308241052873e-07, + "sampling/sampling_logp_difference/max": 15.059157371520996, + "sampling/sampling_logp_difference/mean": 0.019267702475190163, + "step": 274 + }, + { + "clip_ratio/high_max": 8.562770290154731e-06, + "clip_ratio/high_mean": 2.1406925725386827e-06, + "clip_ratio/low_mean": 4.060094340729847e-05, + "clip_ratio/low_min": 3.8700886761944275e-06, + "clip_ratio/region_mean": 4.2741635979837156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15350.0, + "completions/mean_length": 6696.3515625, + "completions/mean_terminated_length": 6542.57958984375, + "completions/min_length": 1239.0, + "completions/min_terminated_length": 1239.0, + "entropy": 
0.8495818004012108, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003412836929783225, + "learning_rate": 1e-05, + "loss": 0.0803, + "num_tokens": 237423101.0, + "reward": 0.515625, + "reward_std": 0.37981897592544556, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.012152798473834991, + "sampling/sampling_logp_difference/max": 4.410195827484131, + "sampling/sampling_logp_difference/mean": 0.018458625301718712, + "step": 275 + }, + { + "clip_ratio/high_max": 1.1463653436294408e-05, + "clip_ratio/high_mean": 3.646129641765583e-06, + "clip_ratio/low_mean": 6.144847083078275e-05, + "clip_ratio/low_min": 1.110105540647055e-05, + "clip_ratio/region_mean": 6.509460160941671e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15666.0, + "completions/mean_length": 7700.3671875, + "completions/mean_terminated_length": 7121.45849609375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.8258870914578438, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024443145375698805, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 238429956.0, + "reward": 0.375, + "reward_std": 0.2872493863105774, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999113082885742, + "sampling/importance_sampling_ratio/min": 0.00026112530031241477, + "sampling/sampling_logp_difference/max": 8.250510215759277, + "sampling/sampling_logp_difference/mean": 0.019427984952926636, + "step": 276 + }, + { + "clip_ratio/high_max": 4.218127742205979e-06, + "clip_ratio/high_mean": 
1.0545319355514948e-06, + "clip_ratio/low_mean": 1.7289162997258245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.834369493280974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16112.0, + "completions/mean_length": 6255.21875, + "completions/mean_terminated_length": 6094.44482421875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.8179014846682549, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022747826296836138, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 239250160.0, + "reward": 0.5234375, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.0002633975527714938, + "sampling/sampling_logp_difference/max": 8.241846084594727, + "sampling/sampling_logp_difference/mean": 0.018723051995038986, + "step": 277 + }, + { + "clip_ratio/high_max": 1.698448841125355e-05, + "clip_ratio/high_mean": 5.369374321162468e-06, + "clip_ratio/low_mean": 6.14647315160255e-05, + "clip_ratio/low_min": 5.043576493335422e-06, + "clip_ratio/region_mean": 6.683410583718796e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15321.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6914.9609375, + "completions/mean_terminated_length": 6914.9609375, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9700981751084328, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005685295443981886, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 240156211.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 
0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05, + "sampling/sampling_logp_difference/max": 9.997581481933594, + "sampling/sampling_logp_difference/mean": 0.021195171400904655, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9186837764427764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9186837764427764e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15469.0, + "completions/mean_length": 5227.53125, + "completions/mean_terminated_length": 5139.68505859375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9116031974554062, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003880272386595607, + "learning_rate": 1e-05, + "loss": 0.1246, + "num_tokens": 240845295.0, + "reward": 0.6328125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000362396240234, + "sampling/importance_sampling_ratio/min": 0.00012422871077433228, + "sampling/sampling_logp_difference/max": 8.993386268615723, + "sampling/sampling_logp_difference/mean": 0.018801718950271606, + "step": 279 + }, + { + "clip_ratio/high_max": 2.5015486926349695e-05, + "clip_ratio/high_mean": 8.084949570275057e-06, + "clip_ratio/low_mean": 5.524710468307603e-05, + "clip_ratio/low_min": 3.776891389861703e-06, + "clip_ratio/region_mean": 6.333205465125502e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 
8065.4765625, + "completions/mean_terminated_length": 7510.90869140625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.7446574792265892, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028986844699829817, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 241895676.0, + "reward": 0.4921875, + "reward_std": 0.3474721610546112, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.0017039099475368857, + "sampling/sampling_logp_difference/max": 6.3748297691345215, + "sampling/sampling_logp_difference/mean": 0.01853121444582939, + "step": 280 + }, + { + "clip_ratio/high_max": 9.486341014053323e-06, + "clip_ratio/high_mean": 2.371585253513331e-06, + "clip_ratio/low_mean": 2.896106741445692e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133265261112683e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15534.0, + "completions/max_terminated_length": 15534.0, + "completions/mean_length": 6127.359375, + "completions/mean_terminated_length": 6127.359375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.8569132760167122, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003845847910270095, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 242698258.0, + "reward": 0.53125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000942945480347, + "sampling/importance_sampling_ratio/min": 0.00043231461313553154, + "sampling/sampling_logp_difference/max": 7.746356964111328, + 
"sampling/sampling_logp_difference/mean": 0.01856958493590355, + "step": 281 + }, + { + "clip_ratio/high_max": 2.9848330086679198e-05, + "clip_ratio/high_mean": 7.4620825216697995e-06, + "clip_ratio/low_mean": 4.3558867673709756e-05, + "clip_ratio/low_min": 4.417741820361698e-06, + "clip_ratio/region_mean": 5.1020949285884853e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15192.0, + "completions/mean_length": 6600.1484375, + "completions/mean_terminated_length": 6365.33642578125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.78924310952425, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003953634761273861, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 243560957.0, + "reward": 0.5546875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.0006525487406179309, + "sampling/sampling_logp_difference/max": 7.334624767303467, + "sampling/sampling_logp_difference/mean": 0.018097909167408943, + "step": 282 + }, + { + "clip_ratio/high_max": 6.635561703660642e-06, + "clip_ratio/high_mean": 1.6588904259151604e-06, + "clip_ratio/low_mean": 2.737523408313791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9034124281679397e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7852.171875, + "completions/mean_terminated_length": 7852.171875, + "completions/min_length": 1276.0, + "completions/min_terminated_length": 1276.0, + "entropy": 1.0598893761634827, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00360781978815794, 
+ "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 244585923.0, + "reward": 0.3125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05, + "sampling/sampling_logp_difference/max": 10.076086044311523, + "sampling/sampling_logp_difference/mean": 0.022330068051815033, + "step": 283 + }, + { + "clip_ratio/high_max": 3.1540168947685743e-06, + "clip_ratio/high_mean": 7.885042236921436e-07, + "clip_ratio/low_mean": 4.7973388973332476e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.876189268543385e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7972.2265625, + "completions/mean_terminated_length": 7700.87890625, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.933217465877533, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0027661293279379606, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 245628064.0, + "reward": 0.28125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05, + "sampling/sampling_logp_difference/max": 10.366576194763184, + "sampling/sampling_logp_difference/mean": 0.021125148981809616, + "step": 284 + }, + { + "clip_ratio/high_max": 1.2965969062861404e-05, + "clip_ratio/high_mean": 3.241492265715351e-06, + "clip_ratio/low_mean": 4.6317693090713874e-05, + "clip_ratio/low_min": 3.820877282123547e-06, + 
"clip_ratio/region_mean": 4.955918507221213e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7135.6953125, + "completions/mean_terminated_length": 6913.736328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.7786942347884178, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005680318456143141, + "learning_rate": 1e-05, + "loss": 0.0786, + "num_tokens": 246561329.0, + "reward": 0.4296875, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462366104126, + "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05, + "sampling/sampling_logp_difference/max": 9.737424850463867, + "sampling/sampling_logp_difference/mean": 0.018504241481423378, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.22437145175536e-05, + "clip_ratio/low_min": 1.4025082009538892e-05, + "clip_ratio/region_mean": 4.22437145175536e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6704.046875, + "completions/mean_terminated_length": 6627.82666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 1.0435140281915665, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026402862276881933, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 247437415.0, + "reward": 0.3828125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.0007800163584761322, + "sampling/sampling_logp_difference/max": 7.156195640563965, + "sampling/sampling_logp_difference/mean": 0.02134273201227188, + "step": 286 + }, + { + "clip_ratio/high_max": 2.223430897174694e-05, + "clip_ratio/high_mean": 6.8746438159905665e-06, + "clip_ratio/low_mean": 4.7084630978133646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3959275192028144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 5892.5078125, + "completions/mean_terminated_length": 5725.9765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.8004944771528244, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003993614576756954, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 248211112.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0024652592837810516, + "sampling/sampling_logp_difference/max": 6.005458354949951, + "sampling/sampling_logp_difference/mean": 0.01924925297498703, + "step": 287 + }, + { + "clip_ratio/high_max": 2.1833082200828358e-05, + "clip_ratio/high_mean": 5.458270550207089e-06, + "clip_ratio/low_mean": 3.415995615796419e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961822596920683e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 7812.140625, + "completions/mean_terminated_length": 7316.24755859375, + "completions/min_length": 1515.0, + 
"completions/min_terminated_length": 1515.0, + "entropy": 0.8841542899608612, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001573400106281042, + "learning_rate": 1e-05, + "loss": 0.0823, + "num_tokens": 249228106.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 0.001001527882181108, + "sampling/sampling_logp_difference/max": 6.906228542327881, + "sampling/sampling_logp_difference/mean": 0.01956877112388611, + "step": 288 + }, + { + "clip_ratio/high_max": 1.014439021673752e-05, + "clip_ratio/high_mean": 2.53609755418438e-06, + "clip_ratio/low_mean": 3.068193461785995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.321803217204433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 6372.953125, + "completions/mean_terminated_length": 6132.6884765625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.8228401988744736, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021125099156051874, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 250063284.0, + "reward": 0.5, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05, + "sampling/sampling_logp_difference/max": 9.937475204467773, + "sampling/sampling_logp_difference/mean": 0.01943521574139595, + "step": 289 + }, + { + "clip_ratio/high_max": 
7.023906164249638e-06, + "clip_ratio/high_mean": 1.7559765410624095e-06, + "clip_ratio/low_mean": 2.526416994896863e-05, + "clip_ratio/low_min": 6.7760895490209805e-06, + "clip_ratio/region_mean": 2.7020146660561295e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16270.0, + "completions/mean_length": 7817.8671875, + "completions/mean_terminated_length": 7396.58154296875, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.9454319775104523, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022315154783427715, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 251085123.0, + "reward": 0.40625, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06, + "sampling/sampling_logp_difference/max": 12.760490417480469, + "sampling/sampling_logp_difference/mean": 0.021764669567346573, + "step": 290 + }, + { + "clip_ratio/high_max": 1.4797966287005693e-05, + "clip_ratio/high_mean": 3.699491571751423e-06, + "clip_ratio/low_mean": 4.36271948274225e-05, + "clip_ratio/low_min": 3.6957101201551268e-06, + "clip_ratio/region_mean": 4.732668639917392e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 7168.4921875, + "completions/mean_terminated_length": 6635.36328125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8433891162276268, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 252020906.0, + "reward": 
0.5546875, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.0003851866349577904, + "sampling/sampling_logp_difference/max": 7.861782550811768, + "sampling/sampling_logp_difference/mean": 0.01929781585931778, + "step": 291 + }, + { + "clip_ratio/high_max": 1.996871560550062e-05, + "clip_ratio/high_mean": 6.089093403716106e-06, + "clip_ratio/low_mean": 4.2792244585143635e-05, + "clip_ratio/low_min": 1.0337215371691855e-05, + "clip_ratio/region_mean": 4.8881338216233416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7322.5078125, + "completions/mean_terminated_length": 6876.8603515625, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 0.9157031401991844, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036942458245903254, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 252977435.0, + "reward": 0.3359375, + "reward_std": 0.24275577068328857, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.00029605376766994596, + "sampling/sampling_logp_difference/max": 8.124969482421875, + "sampling/sampling_logp_difference/mean": 0.0205365102738142, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.631919460327481e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.631919460327481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16078.0, + "completions/mean_length": 7025.484375, + "completions/mean_terminated_length": 6723.5966796875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 1.1329731941223145, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034127074759453535, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 253896161.0, + "reward": 0.25, + "reward_std": 0.27722424268722534, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0005197672289796174, + "sampling/sampling_logp_difference/max": 7.562129497528076, + "sampling/sampling_logp_difference/mean": 0.023741140961647034, + "step": 293 + }, + { + "clip_ratio/high_max": 4.368643658381188e-06, + "clip_ratio/high_mean": 1.092160914595297e-06, + "clip_ratio/low_mean": 2.4661783299961826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5753944555617636e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13776.0, + "completions/mean_length": 5996.1796875, + "completions/mean_terminated_length": 5661.08837890625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8773328885436058, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003959407564252615, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 254690264.0, + "reward": 0.53125, + "reward_std": 0.26645541191101074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07, + 
"sampling/sampling_logp_difference/max": 15.73043155670166, + "sampling/sampling_logp_difference/mean": 0.018407585099339485, + "step": 294 + }, + { + "clip_ratio/high_max": 1.616483677935321e-05, + "clip_ratio/high_mean": 4.041209194838302e-06, + "clip_ratio/low_mean": 3.736187466074625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140308453770558e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7165.328125, + "completions/mean_terminated_length": 6867.951171875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.9502597972750664, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030910037457942963, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 255626394.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000731945037842, + "sampling/importance_sampling_ratio/min": 0.00022311302018351853, + "sampling/sampling_logp_difference/max": 8.407832145690918, + "sampling/sampling_logp_difference/mean": 0.020668907091021538, + "step": 295 + }, + { + "clip_ratio/high_max": 1.1702686606440693e-05, + "clip_ratio/high_mean": 2.9256716516101733e-06, + "clip_ratio/low_mean": 5.5247357522603124e-05, + "clip_ratio/low_min": 3.6811261452385224e-06, + "clip_ratio/region_mean": 5.8173028264718596e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15375.0, + "completions/mean_length": 8001.9296875, + "completions/mean_terminated_length": 7661.34912109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8591345250606537, + "epoch": 
0.27230910763569455, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037233952898532152, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 256673457.0, + "reward": 0.421875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.0021876997780054808, + "sampling/sampling_logp_difference/max": 6.124904632568359, + "sampling/sampling_logp_difference/mean": 0.020540472120046616, + "step": 296 + }, + { + "clip_ratio/high_max": 3.721341136042611e-05, + "clip_ratio/high_mean": 1.2759249216287571e-05, + "clip_ratio/low_mean": 3.570647322703735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.846572301175911e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 6924.84375, + "completions/mean_terminated_length": 6697.82421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.7969356626272202, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006054217461496592, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 257578501.0, + "reward": 0.5078125, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.007889713160693645, + "sampling/sampling_logp_difference/max": 4.842195510864258, + "sampling/sampling_logp_difference/mean": 0.019306108355522156, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0211543894911301e-05, + "clip_ratio/high_mean": 2.5528859737278253e-06, + 
"clip_ratio/low_mean": 5.2388056587915344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4940942732173426e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14439.0, + "completions/mean_length": 6203.03125, + "completions/mean_terminated_length": 5958.6884765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.8734413683414459, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004903806839138269, + "learning_rate": 1e-05, + "loss": 0.0689, + "num_tokens": 258392625.0, + "reward": 0.4453125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 0.00020370795391499996, + "sampling/sampling_logp_difference/max": 8.498823165893555, + "sampling/sampling_logp_difference/mean": 0.01909301057457924, + "step": 298 + }, + { + "clip_ratio/high_max": 1.5135058674786706e-05, + "clip_ratio/high_mean": 4.64845766146027e-06, + "clip_ratio/low_mean": 4.373456977191381e-05, + "clip_ratio/low_min": 3.670856358439778e-06, + "clip_ratio/region_mean": 4.8383026296505705e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 7982.5390625, + "completions/mean_terminated_length": 7641.01611328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0091779381036758, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033637424930930138, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 259435270.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999765753746033, + "sampling/importance_sampling_ratio/min": 0.0016514655435457826, + "sampling/sampling_logp_difference/max": 6.406092166900635, + "sampling/sampling_logp_difference/mean": 0.02182736061513424, + "step": 299 + }, + { + "clip_ratio/high_max": 2.3964702677403693e-05, + "clip_ratio/high_mean": 5.991175669350923e-06, + "clip_ratio/low_mean": 5.2442986770984135e-05, + "clip_ratio/low_min": 8.75736759553547e-06, + "clip_ratio/region_mean": 5.843416238349164e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6915.3125, + "completions/mean_terminated_length": 6688.064453125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.7964543774724007, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052203768864274025, + "learning_rate": 1e-05, + "loss": 0.144, + "num_tokens": 260337614.0, + "reward": 0.46875, + "reward_std": 0.37928223609924316, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 7.032832218101248e-05, + "sampling/sampling_logp_difference/max": 9.562335968017578, + "sampling/sampling_logp_difference/mean": 0.017896221950650215, + "step": 300 + }, + { + "clip_ratio/high_max": 4.458271632756805e-05, + "clip_ratio/high_mean": 1.1145679081892013e-05, + "clip_ratio/low_mean": 6.243192206056847e-05, + "clip_ratio/low_min": 1.2397775662975619e-05, + "clip_ratio/region_mean": 7.357759886872373e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + 
"completions/mean_length": 7029.4375, + "completions/mean_terminated_length": 6880.95263671875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.8605096861720085, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005570738110691309, + "learning_rate": 1e-05, + "loss": 0.0984, + "num_tokens": 261254070.0, + "reward": 0.4765625, + "reward_std": 0.3327290117740631, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999494552612305, + "sampling/importance_sampling_ratio/min": 0.0009070249507203698, + "sampling/sampling_logp_difference/max": 7.005340576171875, + "sampling/sampling_logp_difference/mean": 0.01905740052461624, + "step": 301 + }, + { + "clip_ratio/high_max": 3.390461233720998e-05, + "clip_ratio/high_mean": 1.1191766247975465e-05, + "clip_ratio/low_mean": 7.46641262594494e-05, + "clip_ratio/low_min": 5.041745680500753e-06, + "clip_ratio/region_mean": 8.585589102949598e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5858.84375, + "completions/mean_terminated_length": 5606.240234375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8430554121732712, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004496110137552023, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 262024906.0, + "reward": 0.4453125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294877052307, + "sampling/importance_sampling_ratio/min": 0.00040469475788995624, + 
"sampling/sampling_logp_difference/max": 7.812377452850342, + "sampling/sampling_logp_difference/mean": 0.019225869327783585, + "step": 302 + }, + { + "clip_ratio/high_max": 3.2563955301156966e-06, + "clip_ratio/high_mean": 8.140988825289242e-07, + "clip_ratio/low_mean": 3.7080020149460324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.789411886145899e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15976.0, + "completions/mean_length": 8337.328125, + "completions/mean_terminated_length": 7728.7568359375, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.901745393872261, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00348713924176991, + "learning_rate": 1e-05, + "loss": -0.0002, + "num_tokens": 263110844.0, + "reward": 0.296875, + "reward_std": 0.20805485546588898, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.0022652465850114822, + "sampling/sampling_logp_difference/max": 6.090071678161621, + "sampling/sampling_logp_difference/mean": 0.02157524600625038, + "step": 303 + }, + { + "clip_ratio/high_max": 2.3739744847262045e-05, + "clip_ratio/high_mean": 5.934936211815511e-06, + "clip_ratio/low_mean": 2.823553325015382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.417046866616147e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7084.7265625, + "completions/mean_terminated_length": 6381.42041015625, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8265534415841103, + "epoch": 0.2796688132474701, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003980033565312624, + "learning_rate": 1e-05, + "loss": 0.0551, + "num_tokens": 264036169.0, + "reward": 0.3984375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673366546631, + "sampling/importance_sampling_ratio/min": 0.00012345099821686745, + "sampling/sampling_logp_difference/max": 8.999666213989258, + "sampling/sampling_logp_difference/mean": 0.018782664090394974, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1745505617000163e-05, + "clip_ratio/high_mean": 3.771558226617344e-06, + "clip_ratio/low_mean": 6.913120819262986e-05, + "clip_ratio/low_min": 2.494283216947224e-05, + "clip_ratio/region_mean": 7.290276607818669e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6543.796875, + "completions/mean_terminated_length": 6543.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8899869695305824, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.006467343773692846, + "learning_rate": 1e-05, + "loss": 0.1139, + "num_tokens": 264892767.0, + "reward": 0.484375, + "reward_std": 0.3934885561466217, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000489950180054, + "sampling/importance_sampling_ratio/min": 9.891482477542013e-05, + "sampling/sampling_logp_difference/max": 9.221251487731934, + "sampling/sampling_logp_difference/mean": 0.02032080665230751, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.395576979732141e-05, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 4.395576979732141e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16307.0, + "completions/mean_length": 8483.390625, + "completions/mean_terminated_length": 7813.84765625, + "completions/min_length": 1342.0, + "completions/min_terminated_length": 1342.0, + "entropy": 0.9621479511260986, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003174177836626768, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 265995697.0, + "reward": 0.3359375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.0005628522485494614, + "sampling/sampling_logp_difference/max": 7.4824934005737305, + "sampling/sampling_logp_difference/mean": 0.02145479805767536, + "step": 306 + }, + { + "clip_ratio/high_max": 1.2596524811669951e-05, + "clip_ratio/high_mean": 3.149131202917488e-06, + "clip_ratio/low_mean": 3.7911659774181317e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.106079018129094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14985.0, + "completions/mean_length": 7184.578125, + "completions/mean_terminated_length": 6963.79248046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9993807673454285, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003356153378263116, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 266937707.0, + "reward": 0.3828125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.0017036627978086472, + "sampling/sampling_logp_difference/max": 6.374974727630615, + "sampling/sampling_logp_difference/mean": 0.02204768732190132, + "step": 307 + }, + { + "clip_ratio/high_max": 1.9245163684900035e-05, + "clip_ratio/high_mean": 4.811290921225009e-06, + "clip_ratio/low_mean": 4.8845648166206956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.365693925796222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16216.0, + "completions/mean_length": 7029.2265625, + "completions/mean_terminated_length": 6727.45947265625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.9139953926205635, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006375293247401714, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 267853880.0, + "reward": 0.4765625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.010649868287146091, + "sampling/sampling_logp_difference/max": 4.542207717895508, + "sampling/sampling_logp_difference/mean": 0.020365029573440552, + "step": 308 + }, + { + "clip_ratio/high_max": 4.812504812434781e-06, + "clip_ratio/high_mean": 1.2031262031086953e-06, + "clip_ratio/low_mean": 2.5999243803198624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.720237000630732e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6188.0078125, + "completions/mean_terminated_length": 
5943.30419921875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.7640773430466652, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003697809297591448, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 268665721.0, + "reward": 0.5078125, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372363090515, + "sampling/importance_sampling_ratio/min": 0.02927250787615776, + "sampling/sampling_logp_difference/max": 3.531106472015381, + "sampling/sampling_logp_difference/mean": 0.016581017524003983, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1358927824621787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1358927824621787e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 8128.21875, + "completions/mean_terminated_length": 7861.90283203125, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.8218234181404114, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002286596456542611, + "learning_rate": 1e-05, + "loss": 0.0763, + "num_tokens": 269726181.0, + "reward": 0.375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798536300659, + "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06, + "sampling/sampling_logp_difference/max": 12.90043830871582, + "sampling/sampling_logp_difference/mean": 0.019403984770178795, + "step": 310 + }, + { + 
"clip_ratio/high_max": 1.4808477317274082e-05, + "clip_ratio/high_mean": 3.7021193293185206e-06, + "clip_ratio/low_mean": 3.0363167581981543e-05, + "clip_ratio/low_min": 6.364238288369961e-06, + "clip_ratio/region_mean": 3.4065286854456645e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 5673.3359375, + "completions/mean_terminated_length": 5503.32568359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.9275510385632515, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00485506234690547, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 270470616.0, + "reward": 0.4921875, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.0009123464697040617, + "sampling/sampling_logp_difference/max": 6.999490737915039, + "sampling/sampling_logp_difference/mean": 0.01881871558725834, + "step": 311 + }, + { + "clip_ratio/high_max": 1.1274602456978755e-05, + "clip_ratio/high_mean": 3.6739949109687586e-06, + "clip_ratio/low_mean": 3.968570712231667e-05, + "clip_ratio/low_min": 3.4213767321489286e-06, + "clip_ratio/region_mean": 4.335970191959859e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 6944.8984375, + "completions/mean_terminated_length": 6795.07177734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9335741624236107, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005874342750757933, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 
271377723.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000594854354858, + "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05, + "sampling/sampling_logp_difference/max": 10.049861907958984, + "sampling/sampling_logp_difference/mean": 0.020590776577591896, + "step": 312 + }, + { + "clip_ratio/high_max": 1.264126694877632e-05, + "clip_ratio/high_mean": 3.16031673719408e-06, + "clip_ratio/low_mean": 3.206376845810155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.522408474054828e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7705.625, + "completions/mean_terminated_length": 7278.8193359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.8491624072194099, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001684082904830575, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 272384891.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 6.605865200981498e-05, + "sampling/sampling_logp_difference/max": 9.624967575073242, + "sampling/sampling_logp_difference/mean": 0.020136822015047073, + "step": 313 + }, + { + "clip_ratio/high_max": 9.772357770998497e-06, + "clip_ratio/high_mean": 2.443089442749624e-06, + "clip_ratio/low_mean": 3.8573590472879005e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101667946088128e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6611.1484375, + "completions/mean_terminated_length": 6534.19677734375, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "entropy": 0.8867302760481834, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003692191792652011, + "learning_rate": 1e-05, + "loss": 0.1233, + "num_tokens": 273251630.0, + "reward": 0.3984375, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.0031062732450664043, + "sampling/sampling_logp_difference/max": 5.774331569671631, + "sampling/sampling_logp_difference/mean": 0.019237037748098373, + "step": 314 + }, + { + "clip_ratio/high_max": 3.0103737344688852e-05, + "clip_ratio/high_mean": 9.664363972206047e-06, + "clip_ratio/low_mean": 1.7575501146893657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.723986426644842e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15786.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 6770.46875, + "completions/mean_terminated_length": 6770.46875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.8252957463264465, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004167635925114155, + "learning_rate": 1e-05, + "loss": -0.0072, + "num_tokens": 274146482.0, + "reward": 0.5703125, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + 
"sampling/importance_sampling_ratio/min": 0.00010247006866848096, + "sampling/sampling_logp_difference/max": 9.18593978881836, + "sampling/sampling_logp_difference/mean": 0.019684650003910065, + "step": 315 + }, + { + "clip_ratio/high_max": 6.529460733872838e-06, + "clip_ratio/high_mean": 1.6323651834682096e-06, + "clip_ratio/low_mean": 3.877351048231503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.040587566578324e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15827.0, + "completions/mean_length": 8210.859375, + "completions/mean_terminated_length": 7365.36181640625, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.8118235394358635, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030363225378096104, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 275214040.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998943209648132, + "sampling/importance_sampling_ratio/min": 0.002854935359209776, + "sampling/sampling_logp_difference/max": 5.858705997467041, + "sampling/sampling_logp_difference/mean": 0.019275270402431488, + "step": 316 + }, + { + "clip_ratio/high_max": 7.0800629146106075e-06, + "clip_ratio/high_mean": 1.7700157286526519e-06, + "clip_ratio/low_mean": 2.3981688286767167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5751703674359305e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14900.0, + "completions/mean_length": 7072.8828125, + "completions/mean_terminated_length": 6849.41650390625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 
0.8018335327506065, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004777858033776283, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 276138049.0, + "reward": 0.453125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 0.0028502768836915493, + "sampling/sampling_logp_difference/max": 5.860339164733887, + "sampling/sampling_logp_difference/mean": 0.01849908009171486, + "step": 317 + }, + { + "clip_ratio/high_max": 2.259368602608447e-05, + "clip_ratio/high_mean": 5.648421506521117e-06, + "clip_ratio/low_mean": 4.28424866640853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.849090737479855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14447.0, + "completions/mean_length": 5889.8359375, + "completions/mean_terminated_length": 5723.26220703125, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.7976400703191757, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030593445990234613, + "learning_rate": 1e-05, + "loss": 0.1331, + "num_tokens": 276910124.0, + "reward": 0.5859375, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.000139843366923742, + "sampling/sampling_logp_difference/max": 8.874987602233887, + "sampling/sampling_logp_difference/mean": 0.01834402233362198, + "step": 318 + }, + { + "clip_ratio/high_max": 1.4654247024736833e-05, + "clip_ratio/high_mean": 
3.663561756184208e-06, + "clip_ratio/low_mean": 2.377464920755301e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7438210736363544e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 7144.265625, + "completions/mean_terminated_length": 6689.85205078125, + "completions/min_length": 1200.0, + "completions/min_terminated_length": 1200.0, + "entropy": 0.8309404999017715, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004245694726705551, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 277843542.0, + "reward": 0.4453125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998534321784973, + "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05, + "sampling/sampling_logp_difference/max": 11.499897956848145, + "sampling/sampling_logp_difference/mean": 0.01875344291329384, + "step": 319 + }, + { + "clip_ratio/high_max": 6.252500952541595e-06, + "clip_ratio/high_mean": 2.241558604509919e-06, + "clip_ratio/low_mean": 4.735765514851664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9599213525652885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15722.0, + "completions/mean_length": 6779.5234375, + "completions/mean_terminated_length": 6703.8974609375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.9584890529513359, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035574575886130333, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 278730129.0, + "reward": 0.3984375, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 
0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.005792221520096064, + "sampling/sampling_logp_difference/max": 5.151239395141602, + "sampling/sampling_logp_difference/mean": 0.02137477695941925, + "step": 320 + }, + { + "clip_ratio/high_max": 3.2948471016425174e-05, + "clip_ratio/high_mean": 9.518853403278627e-06, + "clip_ratio/low_mean": 2.195712454522436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.14759782895635e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15892.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 5582.9765625, + "completions/mean_terminated_length": 5582.9765625, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8629376217722893, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037982752546668053, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 279462542.0, + "reward": 0.5546875, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999780058860779, + "sampling/importance_sampling_ratio/min": 0.0021874974481761456, + "sampling/sampling_logp_difference/max": 6.124997138977051, + "sampling/sampling_logp_difference/mean": 0.01906203106045723, + "step": 321 + }, + { + "clip_ratio/high_max": 1.1029473625967512e-05, + "clip_ratio/high_mean": 2.757368406491878e-06, + "clip_ratio/low_mean": 5.367386921761863e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6431237737797346e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 
6942.2578125, + "completions/mean_terminated_length": 6477.90966796875, + "completions/min_length": 1156.0, + "completions/min_terminated_length": 1156.0, + "entropy": 0.8147861957550049, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027678858023136854, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 280370207.0, + "reward": 0.4375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998471736907959, + "sampling/importance_sampling_ratio/min": 0.00023058800434228033, + "sampling/sampling_logp_difference/max": 8.3748779296875, + "sampling/sampling_logp_difference/mean": 0.01940828748047352, + "step": 322 + }, + { + "clip_ratio/high_max": 2.6367894406575942e-05, + "clip_ratio/high_mean": 8.765707434577052e-06, + "clip_ratio/low_mean": 3.232976985145797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.109547796815605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6242.53125, + "completions/mean_terminated_length": 5915.38671875, + "completions/min_length": 1220.0, + "completions/min_terminated_length": 1220.0, + "entropy": 0.878915011882782, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00577945914119482, + "learning_rate": 1e-05, + "loss": 0.0839, + "num_tokens": 281189491.0, + "reward": 0.515625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 9.611724817659706e-05, + "sampling/sampling_logp_difference/max": 9.2499418258667, + 
"sampling/sampling_logp_difference/mean": 0.01948760263621807, + "step": 323 + }, + { + "clip_ratio/high_max": 3.50839609382092e-05, + "clip_ratio/high_mean": 1.1664920634757436e-05, + "clip_ratio/low_mean": 1.833109013205103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9996010880495305e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 7004.015625, + "completions/mean_terminated_length": 6622.71533203125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.7964659407734871, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014128695474937558, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 282103997.0, + "reward": 0.4140625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.0024504722096025944, + "sampling/sampling_logp_difference/max": 6.011474609375, + "sampling/sampling_logp_difference/mean": 0.019019678235054016, + "step": 324 + }, + { + "clip_ratio/high_max": 1.832260545597819e-05, + "clip_ratio/high_mean": 4.580651363994548e-06, + "clip_ratio/low_mean": 5.309064226821647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.767129368905444e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7822.6953125, + "completions/mean_terminated_length": 7546.52392578125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.8571138679981232, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002476039342582226, + "learning_rate": 
1e-05, + "loss": 0.0515, + "num_tokens": 283122382.0, + "reward": 0.4609375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.0009774373611435294, + "sampling/sampling_logp_difference/max": 6.930576324462891, + "sampling/sampling_logp_difference/mean": 0.020557202398777008, + "step": 325 + }, + { + "clip_ratio/high_max": 5.738419986300869e-06, + "clip_ratio/high_mean": 1.4346049965752172e-06, + "clip_ratio/low_mean": 4.19679121819172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3402517292179255e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7738.8984375, + "completions/mean_terminated_length": 6844.57763671875, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "entropy": 0.7839021533727646, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005309853237122297, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 284130081.0, + "reward": 0.5234375, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998971223831177, + "sampling/importance_sampling_ratio/min": 0.0001319014554610476, + "sampling/sampling_logp_difference/max": 8.933455467224121, + "sampling/sampling_logp_difference/mean": 0.01873316988348961, + "step": 326 + }, + { + "clip_ratio/high_max": 1.007085802484653e-05, + "clip_ratio/high_mean": 2.5177145062116324e-06, + "clip_ratio/low_mean": 4.043528815600439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.295300277590286e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15952.0, + "completions/mean_length": 7102.2421875, + "completions/mean_terminated_length": 6954.9130859375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8530801385641098, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228116944432259, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 285058720.0, + "reward": 0.5078125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00012956927821505815, + "sampling/sampling_logp_difference/max": 8.951294898986816, + "sampling/sampling_logp_difference/mean": 0.019325006753206253, + "step": 327 + }, + { + "clip_ratio/high_max": 4.06874551117653e-06, + "clip_ratio/high_mean": 1.0171863777941326e-06, + "clip_ratio/low_mean": 3.661125703047219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.762844340826632e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15594.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 6583.4765625, + "completions/mean_terminated_length": 6583.4765625, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.021921381354332, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004967439454048872, + "learning_rate": 1e-05, + "loss": 0.0374, + "num_tokens": 285919765.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, 
+ "sampling/importance_sampling_ratio/min": 0.016675354912877083, + "sampling/sampling_logp_difference/max": 4.093823432922363, + "sampling/sampling_logp_difference/mean": 0.021393200382590294, + "step": 328 + }, + { + "clip_ratio/high_max": 1.2215251445013564e-05, + "clip_ratio/high_mean": 3.053812861253391e-06, + "clip_ratio/low_mean": 4.05305947879242e-05, + "clip_ratio/low_min": 4.215567059873138e-06, + "clip_ratio/region_mean": 4.358440742180392e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16299.0, + "completions/mean_length": 7770.5859375, + "completions/mean_terminated_length": 7346.97509765625, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "entropy": 1.0466903448104858, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004189736675471067, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 286935512.0, + "reward": 0.3828125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.011683559976518154, + "sampling/sampling_logp_difference/max": 4.449572563171387, + "sampling/sampling_logp_difference/mean": 0.021805983036756516, + "step": 329 + }, + { + "clip_ratio/high_max": 2.0567378214764176e-05, + "clip_ratio/high_mean": 5.141844553691044e-06, + "clip_ratio/low_mean": 1.8177100628236076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3318944840866607e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15758.0, + "completions/mean_length": 5689.2421875, + "completions/mean_terminated_length": 5432.568359375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 
1194.0, + "entropy": 0.7778806164860725, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032866497058421373, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 287681943.0, + "reward": 0.640625, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940812587738, + "sampling/importance_sampling_ratio/min": 0.00038077132194302976, + "sampling/sampling_logp_difference/max": 7.873311519622803, + "sampling/sampling_logp_difference/mean": 0.01789461076259613, + "step": 330 + }, + { + "clip_ratio/high_max": 3.109086901531555e-05, + "clip_ratio/high_mean": 7.772717253828887e-06, + "clip_ratio/low_mean": 3.1423560130861006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919627738468989e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13820.0, + "completions/mean_length": 6288.1875, + "completions/mean_terminated_length": 6127.93701171875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.7709921672940254, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023572889622300863, + "learning_rate": 1e-05, + "loss": 0.0746, + "num_tokens": 288506735.0, + "reward": 0.484375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 0.000430915504693985, + "sampling/sampling_logp_difference/max": 7.749598503112793, + "sampling/sampling_logp_difference/mean": 0.017407266423106194, + "step": 331 + }, + { + "clip_ratio/high_max": 3.4638953366084024e-05, + 
"clip_ratio/high_mean": 9.51674803673086e-06, + "clip_ratio/low_mean": 6.26047980176736e-05, + "clip_ratio/low_min": 5.51267930859467e-06, + "clip_ratio/region_mean": 7.212154741864651e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 6775.0234375, + "completions/mean_terminated_length": 6465.05615234375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9338318258523941, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034220058005303144, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 289395498.0, + "reward": 0.390625, + "reward_std": 0.34533774852752686, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.0317598432302475, + "sampling/sampling_logp_difference/max": 3.449552536010742, + "sampling/sampling_logp_difference/mean": 0.019930530339479446, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.159989991123439e-05, + "clip_ratio/low_min": 1.5592839645250933e-05, + "clip_ratio/region_mean": 7.159989991123439e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 7142.9375, + "completions/mean_terminated_length": 6844.83837890625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.971405878663063, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002513247774913907, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 290329082.0, + "reward": 0.328125, + "reward_std": 0.28930896520614624, + 
"rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 3.152207455059397e-07, + "sampling/sampling_logp_difference/max": 14.969992637634277, + "sampling/sampling_logp_difference/mean": 0.022366533055901527, + "step": 333 + }, + { + "clip_ratio/high_max": 1.6507752206962323e-05, + "clip_ratio/high_mean": 4.126938051740581e-06, + "clip_ratio/low_mean": 1.7493430505055585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1620368215735652e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15581.0, + "completions/mean_length": 6412.2109375, + "completions/mean_terminated_length": 6333.69287109375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "entropy": 0.9136044681072235, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0056767817586660385, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 291170133.0, + "reward": 0.421875, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999720454216003, + "sampling/importance_sampling_ratio/min": 0.000458698661532253, + "sampling/sampling_logp_difference/max": 7.687117099761963, + "sampling/sampling_logp_difference/mean": 0.020012658089399338, + "step": 334 + }, + { + "clip_ratio/high_max": 8.26085442895419e-06, + "clip_ratio/high_mean": 2.0652136072385474e-06, + "clip_ratio/low_mean": 3.6938338666914206e-05, + "clip_ratio/low_min": 5.699044777429663e-06, + "clip_ratio/region_mean": 3.900355193309224e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16111.0, + "completions/mean_length": 8066.1015625, + "completions/mean_terminated_length": 7797.7822265625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 1.0789504647254944, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00243841833434999, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 292222082.0, + "reward": 0.3046875, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999664425849915, + "sampling/importance_sampling_ratio/min": 8.481895929435268e-05, + "sampling/sampling_logp_difference/max": 9.374991416931152, + "sampling/sampling_logp_difference/mean": 0.023650091141462326, + "step": 335 + }, + { + "clip_ratio/high_max": 5.320054697222076e-06, + "clip_ratio/high_mean": 1.330013674305519e-06, + "clip_ratio/low_mean": 1.9117383317279746e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0447396991585265e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15176.0, + "completions/mean_length": 6836.046875, + "completions/mean_terminated_length": 6606.896484375, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 1.218759760260582, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020856577903032303, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 293115984.0, + "reward": 0.21875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 
2.784526441246271e-05, + "sampling/sampling_logp_difference/max": 10.488847732543945, + "sampling/sampling_logp_difference/mean": 0.022012067958712578, + "step": 336 + }, + { + "clip_ratio/high_max": 2.5695502699818462e-05, + "clip_ratio/high_mean": 7.549717793153832e-06, + "clip_ratio/low_mean": 4.6741323160404136e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.429104089671455e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7501.9921875, + "completions/mean_terminated_length": 7140.9345703125, + "completions/min_length": 1237.0, + "completions/min_terminated_length": 1237.0, + "entropy": 0.8940394818782806, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005163854919373989, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 294099503.0, + "reward": 0.328125, + "reward_std": 0.30904707312583923, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999276399612427, + "sampling/importance_sampling_ratio/min": 0.0006545600481331348, + "sampling/sampling_logp_difference/max": 7.331547260284424, + "sampling/sampling_logp_difference/mean": 0.020813245326280594, + "step": 337 + }, + { + "clip_ratio/high_max": 3.1606674838258186e-05, + "clip_ratio/high_mean": 9.45794374729303e-06, + "clip_ratio/low_mean": 4.5567895540443715e-05, + "clip_ratio/low_min": 4.458871444512624e-06, + "clip_ratio/region_mean": 5.502583962879726e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7204.828125, + "completions/mean_terminated_length": 6908.7255859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.9961872175335884, + 
"epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029277894645929337, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 295042105.0, + "reward": 0.390625, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000677108764648, + "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05, + "sampling/sampling_logp_difference/max": 10.872637748718262, + "sampling/sampling_logp_difference/mean": 0.020187582820653915, + "step": 338 + }, + { + "clip_ratio/high_max": 1.7963964182854397e-05, + "clip_ratio/high_mean": 5.194059781388205e-06, + "clip_ratio/low_mean": 1.8380221035840805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.357428081722901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15856.0, + "completions/mean_length": 6256.859375, + "completions/mean_terminated_length": 6013.80810546875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "entropy": 0.9293600022792816, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032952844630926847, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 295867039.0, + "reward": 0.46875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999649524688721, + "sampling/importance_sampling_ratio/min": 7.995560008566827e-05, + "sampling/sampling_logp_difference/max": 9.434039115905762, + "sampling/sampling_logp_difference/mean": 0.019491540268063545, + "step": 339 + }, + { + "clip_ratio/high_max": 7.577551059512189e-06, + "clip_ratio/high_mean": 1.8943877648780472e-06, + 
"clip_ratio/low_mean": 2.7479814093567256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9374201631071628e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15412.0, + "completions/mean_length": 7397.84375, + "completions/mean_terminated_length": 7032.552734375, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.8508890569210052, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029417150653898716, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 296832843.0, + "reward": 0.375, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000183582305908, + "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05, + "sampling/sampling_logp_difference/max": 10.93724250793457, + "sampling/sampling_logp_difference/mean": 0.01975393109023571, + "step": 340 + }, + { + "clip_ratio/high_max": 3.281225508544594e-05, + "clip_ratio/high_mean": 1.3302957199812226e-05, + "clip_ratio/low_mean": 5.109179869577929e-05, + "clip_ratio/low_min": 6.657612175331451e-06, + "clip_ratio/region_mean": 6.439475532715733e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6897.765625, + "completions/mean_terminated_length": 6823.07080078125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9046694040298462, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026788609102368355, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 297735285.0, + "reward": 0.421875, + "reward_std": 0.3266732692718506, + "rewards/accuracy_reward/mean": 0.421875, + 
"rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.001710799871943891, + "sampling/sampling_logp_difference/max": 6.370794296264648, + "sampling/sampling_logp_difference/mean": 0.020578179508447647, + "step": 341 + }, + { + "clip_ratio/high_max": 1.7319889593636617e-05, + "clip_ratio/high_mean": 5.168538336874917e-06, + "clip_ratio/low_mean": 7.019768918326008e-05, + "clip_ratio/low_min": 2.541147478041239e-05, + "clip_ratio/region_mean": 7.53662266106403e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 6971.9921875, + "completions/mean_terminated_length": 6509.10595703125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8658201694488525, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005915141198784113, + "learning_rate": 1e-05, + "loss": 0.0923, + "num_tokens": 298645124.0, + "reward": 0.3984375, + "reward_std": 0.3742823898792267, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999268651008606, + "sampling/importance_sampling_ratio/min": 0.000970841443631798, + "sampling/sampling_logp_difference/max": 6.937347412109375, + "sampling/sampling_logp_difference/mean": 0.01906151883304119, + "step": 342 + }, + { + "clip_ratio/high_max": 1.8332865238335216e-05, + "clip_ratio/high_mean": 4.583216309583804e-06, + "clip_ratio/low_mean": 6.167940273371642e-05, + "clip_ratio/low_min": 5.969151516183047e-06, + "clip_ratio/region_mean": 6.626261847486603e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15054.0, + 
"completions/mean_length": 6545.6953125, + "completions/mean_terminated_length": 5889.80859375, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.779609851539135, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032792428974062204, + "learning_rate": 1e-05, + "loss": 0.097, + "num_tokens": 299503781.0, + "reward": 0.609375, + "reward_std": 0.38293448090553284, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999361634254456, + "sampling/importance_sampling_ratio/min": 0.002187495119869709, + "sampling/sampling_logp_difference/max": 6.124998092651367, + "sampling/sampling_logp_difference/mean": 0.017413027584552765, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.46246323235755e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.46246323235755e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7226.515625, + "completions/mean_terminated_length": 7006.736328125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9573849961161613, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005092279519885778, + "learning_rate": 1e-05, + "loss": 0.1102, + "num_tokens": 300447903.0, + "reward": 0.5390625, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999373555183411, + "sampling/importance_sampling_ratio/min": 0.000627054600045085, + "sampling/sampling_logp_difference/max": 7.374476909637451, + 
"sampling/sampling_logp_difference/mean": 0.021570835262537003, + "step": 344 + }, + { + "clip_ratio/high_max": 5.487269390869187e-06, + "clip_ratio/high_mean": 1.3718173477172968e-06, + "clip_ratio/low_mean": 4.7280102080549113e-05, + "clip_ratio/low_min": 1.0166083029616857e-05, + "clip_ratio/region_mean": 4.865191931457957e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14967.0, + "completions/mean_length": 5755.171875, + "completions/mean_terminated_length": 5323.10546875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8482184633612633, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005033228080719709, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 301206021.0, + "reward": 0.390625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.0014573346124961972, + "sampling/sampling_logp_difference/max": 6.531146049499512, + "sampling/sampling_logp_difference/mean": 0.018870476633310318, + "step": 345 + }, + { + "clip_ratio/high_max": 5.421346941147931e-06, + "clip_ratio/high_mean": 1.3553367352869827e-06, + "clip_ratio/low_mean": 1.6510994441887306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.786633117717429e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 7098.7265625, + "completions/mean_terminated_length": 6875.88037109375, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "entropy": 0.87320177257061, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007659573573619127, + 
"learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 302133890.0, + "reward": 0.421875, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0012466582702472806, + "sampling/sampling_logp_difference/max": 6.687288761138916, + "sampling/sampling_logp_difference/mean": 0.019994346424937248, + "step": 346 + }, + { + "clip_ratio/high_max": 1.1556229310372146e-05, + "clip_ratio/high_mean": 2.8890573275930365e-06, + "clip_ratio/low_mean": 3.8744643916288624e-05, + "clip_ratio/low_min": 6.108287834649673e-06, + "clip_ratio/region_mean": 4.1633702039689524e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16139.0, + "completions/mean_length": 6399.96875, + "completions/mean_terminated_length": 6077.90283203125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9481896534562111, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014135175151750445, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 302972566.0, + "reward": 0.4140625, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0025698256213217974, + "sampling/sampling_logp_difference/max": 5.963917255401611, + "sampling/sampling_logp_difference/mean": 0.02073008380830288, + "step": 347 + }, + { + "clip_ratio/high_max": 6.59491388432798e-06, + "clip_ratio/high_mean": 2.545892130001448e-06, + "clip_ratio/low_mean": 4.620846755187813e-05, + "clip_ratio/low_min": 
6.243132702365983e-06, + "clip_ratio/region_mean": 4.875435956819274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 7298.078125, + "completions/mean_terminated_length": 7226.53564453125, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "entropy": 0.8719206526875496, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027898226398974657, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 303925976.0, + "reward": 0.484375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.005236432887613773, + "sampling/sampling_logp_difference/max": 5.252114772796631, + "sampling/sampling_logp_difference/mean": 0.020944103598594666, + "step": 348 + }, + { + "clip_ratio/high_max": 1.052124343914329e-05, + "clip_ratio/high_mean": 2.6303108597858227e-06, + "clip_ratio/low_mean": 2.010384196182713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.273415248055244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14980.0, + "completions/mean_length": 5667.0390625, + "completions/mean_terminated_length": 5496.9287109375, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "entropy": 0.8791451379656792, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012764945859089494, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 304675157.0, + "reward": 0.390625, + "reward_std": 0.17965976893901825, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000383853912354, + "sampling/importance_sampling_ratio/min": 5.054428584116977e-06, + "sampling/sampling_logp_difference/max": 12.195245742797852, + "sampling/sampling_logp_difference/mean": 0.018928447738289833, + "step": 349 + }, + { + "clip_ratio/high_max": 9.578045592206763e-06, + "clip_ratio/high_mean": 2.3945113980516908e-06, + "clip_ratio/low_mean": 3.1114799753595435e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350931149270764e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15354.0, + "completions/max_terminated_length": 15354.0, + "completions/mean_length": 5874.4453125, + "completions/mean_terminated_length": 5874.4453125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9577538818120956, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00509974779561162, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 305447038.0, + "reward": 0.515625, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999423027038574, + "sampling/importance_sampling_ratio/min": 0.004791648127138615, + "sampling/sampling_logp_difference/max": 5.340880870819092, + "sampling/sampling_logp_difference/mean": 0.02114470861852169, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0903062275247066e-05, + "clip_ratio/high_mean": 2.7257655688117666e-06, + "clip_ratio/low_mean": 4.784364205079328e-05, + "clip_ratio/low_min": 3.861600362142781e-06, + "clip_ratio/region_mean": 5.056940744907479e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 6197.5703125, + 
"completions/mean_terminated_length": 6035.88134765625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.8665244281291962, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030849494505673647, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 306258023.0, + "reward": 0.515625, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998056888580322, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.021017421036958694, + "step": 351 + }, + { + "clip_ratio/high_max": 1.4299712574938894e-05, + "clip_ratio/high_mean": 4.3520980170796975e-06, + "clip_ratio/low_mean": 6.213493452378316e-05, + "clip_ratio/low_min": 1.0056635801447555e-05, + "clip_ratio/region_mean": 6.648703174505499e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7522.578125, + "completions/mean_terminated_length": 7381.9208984375, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.8185881152749062, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002946985885500908, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 307240305.0, + "reward": 0.3125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.005127199459820986, + "sampling/sampling_logp_difference/max": 5.273195743560791, + 
"sampling/sampling_logp_difference/mean": 0.01965932548046112, + "step": 352 + }, + { + "clip_ratio/high_max": 1.693051035545068e-05, + "clip_ratio/high_mean": 5.08456730585749e-06, + "clip_ratio/low_mean": 4.2052345861520735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.713691282631771e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14090.0, + "completions/mean_length": 6403.2265625, + "completions/mean_terminated_length": 6163.6884765625, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "entropy": 0.8359840363264084, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031181599479168653, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 308079318.0, + "reward": 0.5, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999215602874756, + "sampling/importance_sampling_ratio/min": 6.73715621815063e-05, + "sampling/sampling_logp_difference/max": 9.605287551879883, + "sampling/sampling_logp_difference/mean": 0.01963040418922901, + "step": 353 + }, + { + "clip_ratio/high_max": 1.3988919135954347e-05, + "clip_ratio/high_mean": 3.497229783988587e-06, + "clip_ratio/low_mean": 6.722658486069122e-05, + "clip_ratio/low_min": 1.858519090092159e-05, + "clip_ratio/region_mean": 7.072381458783639e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7954.03125, + "completions/mean_terminated_length": 7751.71240234375, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.905990719795227, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002656223252415657, + 
"learning_rate": 1e-05, + "loss": 0.1022, + "num_tokens": 309117770.0, + "reward": 0.3828125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999536275863647, + "sampling/importance_sampling_ratio/min": 0.0003354826185386628, + "sampling/sampling_logp_difference/max": 7.999940395355225, + "sampling/sampling_logp_difference/mean": 0.020741507411003113, + "step": 354 + }, + { + "clip_ratio/high_max": 1.7610595023143105e-05, + "clip_ratio/high_mean": 4.402648755785776e-06, + "clip_ratio/low_mean": 4.337988764291367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.778253651238629e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6630.09375, + "completions/mean_terminated_length": 6315.45166015625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.870736837387085, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0060529084876179695, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 309988894.0, + "reward": 0.515625, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998822212219238, + "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05, + "sampling/sampling_logp_difference/max": 10.716434478759766, + "sampling/sampling_logp_difference/mean": 0.02060208097100258, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0448093235027045e-05, + "clip_ratio/high_mean": 2.6120233087567613e-06, + "clip_ratio/low_mean": 3.1030769946482906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.364279325523967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15920.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6679.6171875, + "completions/mean_terminated_length": 6679.6171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9812518879771233, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00400698184967041, + "learning_rate": 1e-05, + "loss": 0.0605, + "num_tokens": 310864013.0, + "reward": 0.421875, + "reward_std": 0.3295465111732483, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999049305915833, + "sampling/importance_sampling_ratio/min": 0.0020593837834894657, + "sampling/sampling_logp_difference/max": 6.1853485107421875, + "sampling/sampling_logp_difference/mean": 0.02098071575164795, + "step": 356 + }, + { + "clip_ratio/high_max": 2.124982574969181e-05, + "clip_ratio/high_mean": 7.736592579021817e-06, + "clip_ratio/low_mean": 2.900951585615985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.674610888992902e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14541.0, + "completions/mean_length": 5523.796875, + "completions/mean_terminated_length": 5173.4677734375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9120645374059677, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005929585546255112, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 311589987.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998446702957153, + "sampling/importance_sampling_ratio/min": 0.0010661041596904397, + "sampling/sampling_logp_difference/max": 6.843744277954102, + "sampling/sampling_logp_difference/mean": 0.019948206841945648, + "step": 357 + }, + { + "clip_ratio/high_max": 2.4486997745043482e-05, + "clip_ratio/high_mean": 8.219769085826556e-06, + "clip_ratio/low_mean": 5.346400575945154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.168377467474784e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15401.0, + "completions/mean_length": 6361.3671875, + "completions/mean_terminated_length": 6282.44873046875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.8044678047299385, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006622390355914831, + "learning_rate": 1e-05, + "loss": 0.1023, + "num_tokens": 312424034.0, + "reward": 0.5078125, + "reward_std": 0.3724474310874939, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.0003157092141918838, + "sampling/sampling_logp_difference/max": 8.060688972473145, + "sampling/sampling_logp_difference/mean": 0.018907658755779266, + "step": 358 + }, + { + "clip_ratio/high_max": 1.0407376748844399e-05, + "clip_ratio/high_mean": 2.6018441872110998e-06, + "clip_ratio/low_mean": 5.925514369664597e-05, + "clip_ratio/low_min": 1.3324347946763737e-05, + "clip_ratio/region_mean": 6.185698703120579e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 7109.0, + "completions/mean_terminated_length": 7035.96826171875, + 
"completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9167275875806808, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004639944992959499, + "learning_rate": 1e-05, + "loss": 0.0861, + "num_tokens": 313353346.0, + "reward": 0.4140625, + "reward_std": 0.3826971650123596, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999389052391052, + "sampling/importance_sampling_ratio/min": 0.0019070414127781987, + "sampling/sampling_logp_difference/max": 6.262202262878418, + "sampling/sampling_logp_difference/mean": 0.02155841514468193, + "step": 359 + }, + { + "clip_ratio/high_max": 3.959046694035351e-05, + "clip_ratio/high_mean": 1.0912523691786191e-05, + "clip_ratio/low_mean": 3.3944450819944905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.485697365907981e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6314.2734375, + "completions/mean_terminated_length": 6072.60009765625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.8780038207769394, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007643720600754023, + "learning_rate": 1e-05, + "loss": 0.0873, + "num_tokens": 314180717.0, + "reward": 0.4609375, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999802112579346, + "sampling/importance_sampling_ratio/min": 0.021285315975546837, + "sampling/sampling_logp_difference/max": 3.8497378826141357, + "sampling/sampling_logp_difference/mean": 0.01964358240365982, + "step": 360 
+ }, + { + "clip_ratio/high_max": 3.065382111344661e-05, + "clip_ratio/high_mean": 9.187473835936544e-06, + "clip_ratio/low_mean": 4.137891801292426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.056639065514901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6718.2265625, + "completions/mean_terminated_length": 6486.24853515625, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8326799497008324, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050973957404494286, + "learning_rate": 1e-05, + "loss": 0.0109, + "num_tokens": 315060842.0, + "reward": 0.5078125, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.0009130688849836588, + "sampling/sampling_logp_difference/max": 6.998699188232422, + "sampling/sampling_logp_difference/mean": 0.019501537084579468, + "step": 361 + }, + { + "clip_ratio/high_max": 8.624853762739804e-06, + "clip_ratio/high_mean": 2.156213440684951e-06, + "clip_ratio/low_mean": 1.8797969062234188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0954182048171788e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 8666.8359375, + "completions/mean_terminated_length": 7941.291015625, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.9526705741882324, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019092690199613571, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 316190325.0, + "reward": 
0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999814629554749, + "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05, + "sampling/sampling_logp_difference/max": 10.249995231628418, + "sampling/sampling_logp_difference/mean": 0.02051631174981594, + "step": 362 + }, + { + "clip_ratio/high_max": 2.147400391550036e-05, + "clip_ratio/high_mean": 6.434908300434472e-06, + "clip_ratio/low_mean": 3.521234066283796e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.164724816746457e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15164.0, + "completions/mean_length": 7661.8203125, + "completions/mean_terminated_length": 7002.16015625, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.8322782590985298, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019530428107827902, + "learning_rate": 1e-05, + "loss": 0.0729, + "num_tokens": 317191878.0, + "reward": 0.4609375, + "reward_std": 0.21382391452789307, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 8.546619210392237e-05, + "sampling/sampling_logp_difference/max": 9.367389678955078, + "sampling/sampling_logp_difference/mean": 0.019894573837518692, + "step": 363 + }, + { + "clip_ratio/high_max": 1.9436202364886412e-05, + "clip_ratio/high_mean": 6.089704697842535e-06, + "clip_ratio/low_mean": 4.2698405422925134e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.878810955233348e-05, + "completions/clipped_ratio": 0.0234375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 7024.859375, + "completions/mean_terminated_length": 6800.240234375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.794853538274765, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031784537713974714, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 318109004.0, + "reward": 0.4921875, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352693557739, + "sampling/importance_sampling_ratio/min": 0.0002962362195830792, + "sampling/sampling_logp_difference/max": 8.124353408813477, + "sampling/sampling_logp_difference/mean": 0.018519200384616852, + "step": 364 + }, + { + "clip_ratio/high_max": 4.127455667912727e-06, + "clip_ratio/high_mean": 1.0318639169781818e-06, + "clip_ratio/low_mean": 4.342453667049995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.445640047379129e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 7282.1796875, + "completions/mean_terminated_length": 6912.1865234375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.904067650437355, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005080109462141991, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 319059075.0, + "reward": 0.4140625, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062108039856, + 
"sampling/importance_sampling_ratio/min": 0.1194523349404335, + "sampling/sampling_logp_difference/max": 6.136754989624023, + "sampling/sampling_logp_difference/mean": 0.019978653639554977, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.608940076243016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.608940076243016e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15625.0, + "completions/mean_length": 7131.5234375, + "completions/mean_terminated_length": 6596.255859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.8849587142467499, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022667953744530678, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 319990046.0, + "reward": 0.46875, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0370909757912159, + "sampling/sampling_logp_difference/max": 3.294381618499756, + "sampling/sampling_logp_difference/mean": 0.02037571743130684, + "step": 366 + }, + { + "clip_ratio/high_max": 1.5356635913121863e-05, + "clip_ratio/high_mean": 3.839158978280466e-06, + "clip_ratio/low_mean": 3.4950805911648786e-05, + "clip_ratio/low_min": 4.876336333836662e-06, + "clip_ratio/region_mean": 3.8789965287833184e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 6655.4453125, + "completions/mean_terminated_length": 6578.84228515625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.7417122721672058, + "epoch": 
0.3376264949402024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00216497085057199, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 320860135.0, + "reward": 0.5625, + "reward_std": 0.3369230031967163, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0005190494703128934, + "sampling/sampling_logp_difference/max": 7.563511371612549, + "sampling/sampling_logp_difference/mean": 0.01771342009305954, + "step": 367 + }, + { + "clip_ratio/high_max": 1.7605634639039636e-05, + "clip_ratio/high_mean": 5.297029474604642e-06, + "clip_ratio/low_mean": 5.688933060810086e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.218636053745286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15849.0, + "completions/mean_length": 7077.1640625, + "completions/mean_terminated_length": 6619.45068359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.8749325424432755, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0028338562697172165, + "learning_rate": 1e-05, + "loss": 0.0643, + "num_tokens": 321783852.0, + "reward": 0.3828125, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998220205307007, + "sampling/importance_sampling_ratio/min": 7.83290306571871e-06, + "sampling/sampling_logp_difference/max": 11.757177352905273, + "sampling/sampling_logp_difference/mean": 0.020299233496189117, + "step": 368 + }, + { + "clip_ratio/high_max": 7.301828190975357e-06, + "clip_ratio/high_mean": 1.8254570477438392e-06, + "clip_ratio/low_mean": 
5.158197632226802e-05, + "clip_ratio/low_min": 3.735804057214409e-06, + "clip_ratio/region_mean": 5.340743223314348e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6034.296875, + "completions/mean_terminated_length": 5525.294921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.80014718323946, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022897711023688316, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 322572882.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999347925186157, + "sampling/importance_sampling_ratio/min": 0.0004105660773348063, + "sampling/sampling_logp_difference/max": 7.7979736328125, + "sampling/sampling_logp_difference/mean": 0.01858348958194256, + "step": 369 + }, + { + "clip_ratio/high_max": 9.364057859784225e-06, + "clip_ratio/high_mean": 3.351393047523743e-06, + "clip_ratio/low_mean": 4.186752630630508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5218919240141986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 8172.109375, + "completions/mean_terminated_length": 7838.29248046875, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "entropy": 0.8732693120837212, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003263789461925626, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 323640904.0, + "reward": 0.2890625, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999354481697083, + "sampling/importance_sampling_ratio/min": 9.27252222027164e-06, + "sampling/sampling_logp_difference/max": 11.588455200195312, + "sampling/sampling_logp_difference/mean": 0.0208889190107584, + "step": 370 + }, + { + "clip_ratio/high_max": 2.0998899799451465e-05, + "clip_ratio/high_mean": 6.692962131182867e-06, + "clip_ratio/low_mean": 4.261424010110204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930720297124935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 7699.203125, + "completions/mean_terminated_length": 7419.04833984375, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.8296505436301231, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0042716520838439465, + "learning_rate": 1e-05, + "loss": 0.0937, + "num_tokens": 324643858.0, + "reward": 0.4921875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874234199524, + "sampling/importance_sampling_ratio/min": 0.00022192654432728887, + "sampling/sampling_logp_difference/max": 8.413164138793945, + "sampling/sampling_logp_difference/mean": 0.018926654011011124, + "step": 371 + }, + { + "clip_ratio/high_max": 7.061349151626928e-06, + "clip_ratio/high_mean": 1.765337287906732e-06, + "clip_ratio/low_mean": 4.5005243464402156e-05, + "clip_ratio/low_min": 3.861838649754645e-06, + "clip_ratio/region_mean": 4.6770580411248375e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 7450.1640625, + 
"completions/mean_terminated_length": 7450.1640625, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 1.0400195196270943, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033558050636202097, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 325617687.0, + "reward": 0.2578125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999459385871887, + "sampling/importance_sampling_ratio/min": 0.039920732378959656, + "sampling/sampling_logp_difference/max": 3.2208595275878906, + "sampling/sampling_logp_difference/mean": 0.02249298244714737, + "step": 372 + }, + { + "clip_ratio/high_max": 1.3147802746971138e-05, + "clip_ratio/high_mean": 3.2869506867427845e-06, + "clip_ratio/low_mean": 2.4451034505545977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7737984851228248e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15342.0, + "completions/mean_length": 6799.0703125, + "completions/mean_terminated_length": 6723.5986328125, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9737623482942581, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005797459278255701, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 326508384.0, + "reward": 0.3125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321699142456, + "sampling/importance_sampling_ratio/min": 7.535634836131067e-07, + "sampling/sampling_logp_difference/max": 14.0984525680542, + 
"sampling/sampling_logp_difference/mean": 0.021543748676776886, + "step": 373 + }, + { + "clip_ratio/high_max": 3.3594023989280686e-06, + "clip_ratio/high_mean": 8.398505997320171e-07, + "clip_ratio/low_mean": 2.3457610382138228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4297460981870245e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 7034.3671875, + "completions/mean_terminated_length": 6654.30078125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8749603256583214, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002258980879560113, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 327426407.0, + "reward": 0.4609375, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.008719252422451973, + "sampling/sampling_logp_difference/max": 4.742221832275391, + "sampling/sampling_logp_difference/mean": 0.01997346058487892, + "step": 374 + }, + { + "clip_ratio/high_max": 2.823375348270929e-05, + "clip_ratio/high_mean": 7.058438370677322e-06, + "clip_ratio/low_mean": 4.9395109726901865e-05, + "clip_ratio/low_min": 1.636556044104509e-05, + "clip_ratio/region_mean": 5.6453548268109444e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15240.0, + "completions/mean_length": 6623.078125, + "completions/mean_terminated_length": 6388.81640625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.858784057199955, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.002420129720121622, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 328292985.0, + "reward": 0.4140625, + "reward_std": 0.3077537417411804, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 0.00014900295354891568, + "sampling/sampling_logp_difference/max": 8.811544418334961, + "sampling/sampling_logp_difference/mean": 0.019645996391773224, + "step": 375 + }, + { + "clip_ratio/high_max": 1.8078507309837732e-05, + "clip_ratio/high_mean": 6.468551191574079e-06, + "clip_ratio/low_mean": 4.051302585139638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.698157727034413e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15229.0, + "completions/mean_length": 5902.4765625, + "completions/mean_terminated_length": 5564.36279296875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.904740035533905, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004107976797968149, + "learning_rate": 1e-05, + "loss": 0.0824, + "num_tokens": 329067006.0, + "reward": 0.5546875, + "reward_std": 0.3945493996143341, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05, + "sampling/sampling_logp_difference/max": 11.37439250946045, + "sampling/sampling_logp_difference/mean": 0.019582755863666534, + "step": 376 + }, + { + "clip_ratio/high_max": 2.553658168835682e-05, + "clip_ratio/high_mean": 7.276365181496658e-06, + "clip_ratio/low_mean": 1.7552573126522475e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.482893796695862e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6425.6015625, + "completions/mean_terminated_length": 6267.5322265625, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.964553713798523, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003208522219210863, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 329910691.0, + "reward": 0.359375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999419450759888, + "sampling/importance_sampling_ratio/min": 0.00137569778598845, + "sampling/sampling_logp_difference/max": 6.588794231414795, + "sampling/sampling_logp_difference/mean": 0.021154657006263733, + "step": 377 + }, + { + "clip_ratio/high_max": 6.8712420215888415e-06, + "clip_ratio/high_mean": 1.7178105053972104e-06, + "clip_ratio/low_mean": 4.0991827404468495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2709637853022286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 8006.4453125, + "completions/mean_terminated_length": 7594.43408203125, + "completions/min_length": 1235.0, + "completions/min_terminated_length": 1235.0, + "entropy": 0.8980336412787437, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002898421371355653, + "learning_rate": 1e-05, + "loss": 0.0815, + "num_tokens": 330956332.0, + "reward": 0.4296875, + "reward_std": 0.20175684988498688, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 9.378339746035635e-05, + "sampling/sampling_logp_difference/max": 9.27452278137207, + "sampling/sampling_logp_difference/mean": 0.021021340042352676, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2689344689297286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2689344689297286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15484.0, + "completions/max_terminated_length": 15484.0, + "completions/mean_length": 7068.828125, + "completions/mean_terminated_length": 7068.828125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.9865007549524307, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0037063576746731997, + "learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 331880918.0, + "reward": 0.3203125, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0001819290773710236, + "sampling/sampling_logp_difference/max": 8.611893653869629, + "sampling/sampling_logp_difference/mean": 0.02072504535317421, + "step": 379 + }, + { + "clip_ratio/high_max": 5.845633268108941e-06, + "clip_ratio/high_mean": 1.4614083170272352e-06, + "clip_ratio/low_mean": 3.207486906831036e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353627721480734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 7379.390625, + "completions/mean_terminated_length": 7236.4609375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + 
"entropy": 0.8977236375212669, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001972826896235347, + "learning_rate": 1e-05, + "loss": 0.0228, + "num_tokens": 332849112.0, + "reward": 0.4140625, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 2.820451663865242e-05, + "sampling/sampling_logp_difference/max": 10.476028442382812, + "sampling/sampling_logp_difference/mean": 0.019411223009228706, + "step": 380 + }, + { + "clip_ratio/high_max": 4.875385002378607e-06, + "clip_ratio/high_mean": 1.2188462505946518e-06, + "clip_ratio/low_mean": 2.3530714997832547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.47495612484272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15517.0, + "completions/mean_length": 6867.9609375, + "completions/mean_terminated_length": 6793.03125, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "entropy": 0.9244343340396881, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.006926023401319981, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333746179.0, + "reward": 0.4140625, + "reward_std": 0.1433562934398651, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.0003875594411510974, + "sampling/sampling_logp_difference/max": 7.8556413650512695, + "sampling/sampling_logp_difference/mean": 0.020311862230300903, + "step": 381 + }, + { + "clip_ratio/high_max": 1.5651628245905158e-05, + 
"clip_ratio/high_mean": 4.836261211949022e-06, + "clip_ratio/low_mean": 5.268017821435933e-05, + "clip_ratio/low_min": 3.950945028918795e-06, + "clip_ratio/region_mean": 5.751643902840442e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 7525.375, + "completions/mean_terminated_length": 6855.3955078125, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9207312315702438, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0047226278111338615, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 334731027.0, + "reward": 0.3359375, + "reward_std": 0.3353874683380127, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999615550041199, + "sampling/importance_sampling_ratio/min": 0.00029753465787507594, + "sampling/sampling_logp_difference/max": 8.119979858398438, + "sampling/sampling_logp_difference/mean": 0.021496692672371864, + "step": 382 + }, + { + "clip_ratio/high_max": 3.815379886873416e-05, + "clip_ratio/high_mean": 9.53844971718354e-06, + "clip_ratio/low_mean": 4.519663821156428e-05, + "clip_ratio/low_min": 2.775434040813707e-06, + "clip_ratio/region_mean": 5.473508826980833e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16251.0, + "completions/mean_length": 6841.0625, + "completions/mean_terminated_length": 6453.13818359375, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.8979457840323448, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004971448332071304, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 335631243.0, + "reward": 0.390625, + "reward_std": 
0.2596156895160675, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999934196472168, + "sampling/importance_sampling_ratio/min": 9.655764188210014e-06, + "sampling/sampling_logp_difference/max": 11.547955513000488, + "sampling/sampling_logp_difference/mean": 0.020256079733371735, + "step": 383 + }, + { + "clip_ratio/high_max": 4.162365712545579e-06, + "clip_ratio/high_mean": 1.0405914281363948e-06, + "clip_ratio/low_mean": 3.1563491688757495e-05, + "clip_ratio/low_min": 3.1228139505401487e-06, + "clip_ratio/region_mean": 3.260408311689389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15060.0, + "completions/mean_length": 6919.8046875, + "completions/mean_terminated_length": 6454.35205078125, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9241961911320686, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038604787550866604, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 336537162.0, + "reward": 0.375, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998080730438232, + "sampling/importance_sampling_ratio/min": 0.0009118975722230971, + "sampling/sampling_logp_difference/max": 6.999982833862305, + "sampling/sampling_logp_difference/mean": 0.02030865103006363, + "step": 384 + }, + { + "clip_ratio/high_max": 6.5182248363271356e-06, + "clip_ratio/high_mean": 1.6295562090817839e-06, + "clip_ratio/low_mean": 4.3847362121596234e-05, + "clip_ratio/low_min": 6.294533704931382e-06, + "clip_ratio/region_mean": 4.547691833067802e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15692.0, + "completions/mean_length": 7679.390625, + "completions/mean_terminated_length": 7099.08349609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 1.0165777206420898, + "epoch": 0.35418583256669733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004624314606189728, + "learning_rate": 1e-05, + "loss": 0.0849, + "num_tokens": 337542492.0, + "reward": 0.3046875, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999251961708069, + "sampling/importance_sampling_ratio/min": 5.83546279813163e-05, + "sampling/sampling_logp_difference/max": 9.748971939086914, + "sampling/sampling_logp_difference/mean": 0.02206476218998432, + "step": 385 + }, + { + "clip_ratio/high_max": 6.00499606662197e-06, + "clip_ratio/high_mean": 1.5012490166554926e-06, + "clip_ratio/low_mean": 3.392923713363416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.543048615028965e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 5957.5859375, + "completions/mean_terminated_length": 5792.08740234375, + "completions/min_length": 1705.0, + "completions/min_terminated_length": 1705.0, + "entropy": 0.7705951780080795, + "epoch": 0.35510579576816925, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021966886706650257, + "learning_rate": 1e-05, + "loss": 0.0789, + "num_tokens": 338324279.0, + "reward": 0.53125, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999998927116394, + 
"sampling/importance_sampling_ratio/min": 0.0008041196851991117, + "sampling/sampling_logp_difference/max": 7.125762462615967, + "sampling/sampling_logp_difference/mean": 0.01804077997803688, + "step": 386 + }, + { + "clip_ratio/high_max": 1.5711350215497077e-05, + "clip_ratio/high_mean": 3.927837553874269e-06, + "clip_ratio/low_mean": 5.276240381135722e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.669024130838807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7269.8046875, + "completions/mean_terminated_length": 7198.03955078125, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 1.0025205165147781, + "epoch": 0.3560257589696412, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001694107661023736, + "learning_rate": 1e-05, + "loss": 0.134, + "num_tokens": 339274662.0, + "reward": 0.3359375, + "reward_std": 0.30487072467803955, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039769172668, + "sampling/importance_sampling_ratio/min": 0.0015677008777856827, + "sampling/sampling_logp_difference/max": 6.4581451416015625, + "sampling/sampling_logp_difference/mean": 0.021742526441812515, + "step": 387 + }, + { + "clip_ratio/high_max": 7.005848829066963e-06, + "clip_ratio/high_mean": 1.7514622072667407e-06, + "clip_ratio/low_mean": 5.100632029098051e-05, + "clip_ratio/low_min": 8.934973720897688e-06, + "clip_ratio/region_mean": 5.275778244140383e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7643.8359375, + "completions/mean_terminated_length": 7288.54443359375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 
1061.0, + "entropy": 0.7936615869402885, + "epoch": 0.35694572217111314, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004587972536683083, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 340272689.0, + "reward": 0.5078125, + "reward_std": 0.35324612259864807, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999613761901855, + "sampling/importance_sampling_ratio/min": 0.0007390327518805861, + "sampling/sampling_logp_difference/max": 7.210168361663818, + "sampling/sampling_logp_difference/mean": 0.01862112432718277, + "step": 388 + }, + { + "clip_ratio/high_max": 1.0522736374696251e-05, + "clip_ratio/high_mean": 2.6306840936740628e-06, + "clip_ratio/low_mean": 2.139122614153166e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4021910121518886e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14401.0, + "completions/mean_length": 7068.734375, + "completions/mean_terminated_length": 6610.60595703125, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "entropy": 0.8858344480395317, + "epoch": 0.3578656853725851, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00245783943682909, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 341195599.0, + "reward": 0.4609375, + "reward_std": 0.21594557166099548, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957263469696, + "sampling/importance_sampling_ratio/min": 1.526316918898374e-05, + "sampling/sampling_logp_difference/max": 11.090067863464355, + "sampling/sampling_logp_difference/mean": 0.019989900290966034, + "step": 389 + }, + { + "clip_ratio/high_max": 5.272259386401856e-06, + 
"clip_ratio/high_mean": 1.318064846600464e-06, + "clip_ratio/low_mean": 2.2939096254503966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4257160987417592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15788.0, + "completions/mean_length": 6093.296875, + "completions/mean_terminated_length": 5929.95263671875, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.9640207663178444, + "epoch": 0.35878564857405704, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0067657483741641045, + "learning_rate": 1e-05, + "loss": 0.0181, + "num_tokens": 341993565.0, + "reward": 0.4453125, + "reward_std": 0.12415502220392227, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998992681503296, + "sampling/importance_sampling_ratio/min": 0.010459281504154205, + "sampling/sampling_logp_difference/max": 4.56026554107666, + "sampling/sampling_logp_difference/mean": 0.02037961222231388, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.566248594528588e-05, + "clip_ratio/low_min": 4.402028480399167e-06, + "clip_ratio/region_mean": 4.566248594528588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16170.0, + "completions/max_terminated_length": 16170.0, + "completions/mean_length": 7620.09375, + "completions/mean_terminated_length": 7620.09375, + "completions/min_length": 1076.0, + "completions/min_terminated_length": 1076.0, + "entropy": 0.9773544892668724, + "epoch": 0.35970561177552896, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018817185191437602, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 342990545.0, + "reward": 0.3046875, + "reward_std": 0.18755048513412476, + "rewards/accuracy_reward/mean": 0.3046875, 
+ "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0006883936002850533, + "sampling/sampling_logp_difference/max": 7.281149864196777, + "sampling/sampling_logp_difference/mean": 0.021528441458940506, + "step": 391 + }, + { + "clip_ratio/high_max": 2.6727505428425502e-05, + "clip_ratio/high_mean": 7.985045499481203e-06, + "clip_ratio/low_mean": 7.762144696243922e-05, + "clip_ratio/low_min": 2.4772080450929934e-05, + "clip_ratio/region_mean": 8.560649303035461e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15053.0, + "completions/mean_length": 6963.984375, + "completions/mean_terminated_length": 6737.904296875, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.9683744385838509, + "epoch": 0.36062557497700093, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052104732021689415, + "learning_rate": 1e-05, + "loss": 0.087, + "num_tokens": 343898791.0, + "reward": 0.4296875, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324679374695, + "sampling/importance_sampling_ratio/min": 0.010815954767167568, + "sampling/sampling_logp_difference/max": 4.526732921600342, + "sampling/sampling_logp_difference/mean": 0.021434593945741653, + "step": 392 + }, + { + "clip_ratio/high_max": 1.3545108686230378e-05, + "clip_ratio/high_mean": 4.365133804640209e-06, + "clip_ratio/low_mean": 2.5377692509209737e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9742826200163108e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15116.0, + 
"completions/mean_length": 6718.5078125, + "completions/mean_terminated_length": 6642.4013671875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9043834507465363, + "epoch": 0.36154553817847285, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005151392426341772, + "learning_rate": 1e-05, + "loss": 0.0085, + "num_tokens": 344779672.0, + "reward": 0.4921875, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999840497970581, + "sampling/importance_sampling_ratio/min": 0.0024171893019229174, + "sampling/sampling_logp_difference/max": 6.025149822235107, + "sampling/sampling_logp_difference/mean": 0.0201373603194952, + "step": 393 + }, + { + "clip_ratio/high_max": 1.2263486723895767e-05, + "clip_ratio/high_mean": 3.927679188109323e-06, + "clip_ratio/low_mean": 2.739263118201052e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132031042696326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16342.0, + "completions/mean_length": 7044.640625, + "completions/mean_terminated_length": 6820.49609375, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "entropy": 0.9017335474491119, + "epoch": 0.3624655013799448, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026606651954352856, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 345701722.0, + "reward": 0.3125, + "reward_std": 0.24146249890327454, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05, + "sampling/sampling_logp_difference/max": 
10.157968521118164, + "sampling/sampling_logp_difference/mean": 0.01981864869594574, + "step": 394 + }, + { + "clip_ratio/high_max": 1.026556356009678e-05, + "clip_ratio/high_mean": 2.566390890024195e-06, + "clip_ratio/low_mean": 4.819571529424138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0762106297952414e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15476.0, + "completions/mean_length": 6031.875, + "completions/mean_terminated_length": 5950.3623046875, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.8537683561444283, + "epoch": 0.36338546458141674, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003957017324864864, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 346492810.0, + "reward": 0.4296875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999707341194153, + "sampling/importance_sampling_ratio/min": 0.0015133036067709327, + "sampling/sampling_logp_difference/max": 6.493460178375244, + "sampling/sampling_logp_difference/mean": 0.018711457028985023, + "step": 395 + }, + { + "clip_ratio/high_max": 5.870488848813693e-06, + "clip_ratio/high_mean": 1.4676222122034233e-06, + "clip_ratio/low_mean": 3.637038832948747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.783801014378696e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 7429.3515625, + "completions/mean_terminated_length": 6911.31396484375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 1194.0, + "entropy": 0.8821266070008278, + "epoch": 0.36430542778288866, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.002122648525983095, + "learning_rate": 1e-05, + "loss": 0.1257, + "num_tokens": 347462871.0, + "reward": 0.453125, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000076293945312, + "sampling/importance_sampling_ratio/min": 0.00014005196862854064, + "sampling/sampling_logp_difference/max": 8.873497009277344, + "sampling/sampling_logp_difference/mean": 0.01998838409781456, + "step": 396 + }, + { + "clip_ratio/high_max": 1.0663932243915042e-05, + "clip_ratio/high_mean": 2.6659830609787605e-06, + "clip_ratio/low_mean": 6.443337406381033e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.709935701110226e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15761.0, + "completions/mean_length": 7131.7109375, + "completions/mean_terminated_length": 6833.25, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.8575824722647667, + "epoch": 0.36522539098436063, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002546454081311822, + "learning_rate": 1e-05, + "loss": 0.0676, + "num_tokens": 348395842.0, + "reward": 0.4921875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999964714050293, + "sampling/importance_sampling_ratio/min": 0.0002167800412280485, + "sampling/sampling_logp_difference/max": 8.436627388000488, + "sampling/sampling_logp_difference/mean": 0.0193922221660614, + "step": 397 + }, + { + "clip_ratio/high_max": 3.847337666229578e-06, + "clip_ratio/high_mean": 9.618344165573944e-07, + "clip_ratio/low_mean": 3.932982110654848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.029165563679271e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16200.0, + "completions/mean_length": 6858.34375, + "completions/mean_terminated_length": 6707.14306640625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.9539813920855522, + "epoch": 0.36614535418583255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00492837093770504, + "learning_rate": 1e-05, + "loss": 0.0818, + "num_tokens": 349292790.0, + "reward": 0.390625, + "reward_std": 0.1949220597743988, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998850226402283, + "sampling/importance_sampling_ratio/min": 0.0011153683299198747, + "sampling/sampling_logp_difference/max": 6.79857063293457, + "sampling/sampling_logp_difference/mean": 0.020318543538451195, + "step": 398 + }, + { + "clip_ratio/high_max": 1.291372609557584e-05, + "clip_ratio/high_mean": 3.22843152389396e-06, + "clip_ratio/low_mean": 3.8245348378040944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1473780811429606e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15261.0, + "completions/mean_length": 7809.984375, + "completions/mean_terminated_length": 7533.40283203125, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.8353303670883179, + "epoch": 0.3670653173873045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004895905964076519, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 350312556.0, + "reward": 0.3203125, + "reward_std": 0.22567616403102875, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999260306358337, + "sampling/importance_sampling_ratio/min": 0.0008417933131568134, + "sampling/sampling_logp_difference/max": 7.0799760818481445, + "sampling/sampling_logp_difference/mean": 0.018754083663225174, + "step": 399 + }, + { + "clip_ratio/high_max": 1.1250081115576904e-05, + "clip_ratio/high_mean": 3.5690324011738994e-06, + "clip_ratio/low_mean": 3.196108968950284e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.553012152224255e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15057.0, + "completions/mean_length": 7194.9296875, + "completions/mean_terminated_length": 6821.39013671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9744522422552109, + "epoch": 0.36798528058877644, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0032397822942584753, + "learning_rate": 1e-05, + "loss": 0.0402, + "num_tokens": 351252755.0, + "reward": 0.421875, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998766183853149, + "sampling/importance_sampling_ratio/min": 0.00023159870761446655, + "sampling/sampling_logp_difference/max": 8.370504379272461, + "sampling/sampling_logp_difference/mean": 0.02105094864964485, + "step": 400 + }, + { + "clip_ratio/high_max": 6.980455509619787e-06, + "clip_ratio/high_mean": 1.7451138774049468e-06, + "clip_ratio/low_mean": 2.2670621888210007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.441573599298863e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 6836.234375, + "completions/mean_terminated_length": 6607.08837890625, + "completions/min_length": 
379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.9149863049387932, + "epoch": 0.3689052437902484, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031576494220644236, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 352145873.0, + "reward": 0.3671875, + "reward_std": 0.22225630283355713, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266862869263, + "sampling/importance_sampling_ratio/min": 0.0011975533561781049, + "sampling/sampling_logp_difference/max": 6.727474689483643, + "sampling/sampling_logp_difference/mean": 0.020445333793759346, + "step": 401 + }, + { + "clip_ratio/high_max": 2.3557336589874467e-05, + "clip_ratio/high_mean": 5.889334147468617e-06, + "clip_ratio/low_mean": 5.359988131203863e-05, + "clip_ratio/low_min": 1.3856095392839052e-05, + "clip_ratio/region_mean": 5.9489215118446737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 6942.65625, + "completions/mean_terminated_length": 6638.0966796875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.7541583999991417, + "epoch": 0.36982520699172033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003970830701291561, + "learning_rate": 1e-05, + "loss": 0.051, + "num_tokens": 353056405.0, + "reward": 0.453125, + "reward_std": 0.3282659649848938, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 8.399576472584158e-06, + "sampling/sampling_logp_difference/max": 11.687329292297363, + "sampling/sampling_logp_difference/mean": 0.018101349472999573, + "step": 402 + }, + { + 
"clip_ratio/high_max": 2.6139805413549766e-05, + "clip_ratio/high_mean": 7.517377525800839e-06, + "clip_ratio/low_mean": 1.968103515537223e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7198412681173068e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14786.0, + "completions/max_terminated_length": 14786.0, + "completions/mean_length": 6022.1875, + "completions/mean_terminated_length": 6022.1875, + "completions/min_length": 1285.0, + "completions/min_terminated_length": 1285.0, + "entropy": 0.9535745903849602, + "epoch": 0.37074517019319225, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0043656788766384125, + "learning_rate": 1e-05, + "loss": 0.029, + "num_tokens": 353844661.0, + "reward": 0.4140625, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.04981832951307297, + "sampling/sampling_logp_difference/max": 2.9993722438812256, + "sampling/sampling_logp_difference/mean": 0.020655371248722076, + "step": 403 + }, + { + "clip_ratio/high_max": 9.152076700047473e-06, + "clip_ratio/high_mean": 2.9508817647183605e-06, + "clip_ratio/low_mean": 5.21388310517068e-05, + "clip_ratio/low_min": 2.633131089169183e-06, + "clip_ratio/region_mean": 5.508971298695542e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15906.0, + "completions/mean_length": 8068.96875, + "completions/mean_terminated_length": 7869.408203125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.9473539590835571, + "epoch": 0.3716651333946642, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006543307099491358, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 354894689.0, + "reward": 0.2578125, + 
"reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 6.672408926533535e-05, + "sampling/sampling_logp_difference/max": 9.614944458007812, + "sampling/sampling_logp_difference/mean": 0.021852033212780952, + "step": 404 + }, + { + "clip_ratio/high_max": 2.9619268843816826e-05, + "clip_ratio/high_mean": 7.4048172109542065e-06, + "clip_ratio/low_mean": 5.5152235972855124e-05, + "clip_ratio/low_min": 1.0455875781190116e-05, + "clip_ratio/region_mean": 6.255705375224352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15748.0, + "completions/mean_length": 5960.1875, + "completions/mean_terminated_length": 5878.1103515625, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 0.9564141109585762, + "epoch": 0.37258509659613614, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003351036459207535, + "learning_rate": 1e-05, + "loss": 0.0293, + "num_tokens": 355677273.0, + "reward": 0.46875, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999220371246338, + "sampling/importance_sampling_ratio/min": 0.0012859756825491786, + "sampling/sampling_logp_difference/max": 6.656237602233887, + "sampling/sampling_logp_difference/mean": 0.021779976785182953, + "step": 405 + }, + { + "clip_ratio/high_max": 7.957685966175632e-06, + "clip_ratio/high_mean": 1.989421491543908e-06, + "clip_ratio/low_mean": 3.758041248147492e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.956983414354909e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15669.0, + "completions/mean_length": 7620.21875, + "completions/mean_terminated_length": 7189.212890625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 1.035948596894741, + "epoch": 0.3735050597976081, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031219006050378084, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 356675829.0, + "reward": 0.296875, + "reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001060962677002, + "sampling/importance_sampling_ratio/min": 0.010141897015273571, + "sampling/sampling_logp_difference/max": 4.591080188751221, + "sampling/sampling_logp_difference/mean": 0.021951109170913696, + "step": 406 + }, + { + "clip_ratio/high_max": 2.286768199155631e-05, + "clip_ratio/high_mean": 5.7169204978890775e-06, + "clip_ratio/low_mean": 3.914574369900947e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.486266482217616e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14038.0, + "completions/mean_length": 5806.0234375, + "completions/mean_terminated_length": 5638.119140625, + "completions/min_length": 1319.0, + "completions/min_terminated_length": 1319.0, + "entropy": 0.8977029845118523, + "epoch": 0.37442502299908004, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002810312667861581, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 357438712.0, + "reward": 0.546875, + "reward_std": 0.22832970321178436, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999280571937561, + 
"sampling/importance_sampling_ratio/min": 0.0011738575994968414, + "sampling/sampling_logp_difference/max": 6.747459888458252, + "sampling/sampling_logp_difference/mean": 0.01965375244617462, + "step": 407 + }, + { + "clip_ratio/high_max": 1.2219379641464911e-05, + "clip_ratio/high_mean": 3.054844910366228e-06, + "clip_ratio/low_mean": 3.186109779562685e-05, + "clip_ratio/low_min": 4.3511558942554984e-06, + "clip_ratio/region_mean": 3.4915943160740426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15705.0, + "completions/max_terminated_length": 15705.0, + "completions/mean_length": 6537.4609375, + "completions/mean_terminated_length": 6537.4609375, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.9577726796269417, + "epoch": 0.37534498620055196, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004516562446951866, + "learning_rate": 1e-05, + "loss": 0.0517, + "num_tokens": 358296731.0, + "reward": 0.3828125, + "reward_std": 0.1830746978521347, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999170303344727, + "sampling/importance_sampling_ratio/min": 2.384942035860149e-06, + "sampling/sampling_logp_difference/max": 12.946335792541504, + "sampling/sampling_logp_difference/mean": 0.021242395043373108, + "step": 408 + }, + { + "clip_ratio/high_max": 1.4422689218918094e-05, + "clip_ratio/high_mean": 3.6056723047295236e-06, + "clip_ratio/low_mean": 3.026239573955536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3868068385345396e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 7896.671875, + "completions/mean_terminated_length": 7622.88671875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + 
"entropy": 0.9163230583071709, + "epoch": 0.37626494940202393, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003542230697348714, + "learning_rate": 1e-05, + "loss": 0.05, + "num_tokens": 359327001.0, + "reward": 0.375, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998560547828674, + "sampling/importance_sampling_ratio/min": 0.00010891625424847007, + "sampling/sampling_logp_difference/max": 9.124931335449219, + "sampling/sampling_logp_difference/mean": 0.020085681229829788, + "step": 409 + }, + { + "clip_ratio/high_max": 1.7827243254942005e-05, + "clip_ratio/high_mean": 5.474494003010477e-06, + "clip_ratio/low_mean": 4.2465159026505717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.793965263161226e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15297.0, + "completions/mean_length": 6728.7109375, + "completions/mean_terminated_length": 6652.68505859375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9010183215141296, + "epoch": 0.37718491260349585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0035069347359240055, + "learning_rate": 1e-05, + "loss": 0.0518, + "num_tokens": 360208780.0, + "reward": 0.5390625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999571442604065, + "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05, + "sampling/sampling_logp_difference/max": 11.124998092651367, + "sampling/sampling_logp_difference/mean": 0.021022530272603035, + "step": 410 + }, + { + "clip_ratio/high_max": 1.0376989393989788e-05, + "clip_ratio/high_mean": 
2.594247348497447e-06, + "clip_ratio/low_mean": 2.8587513156708155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1181759936771414e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6800.3984375, + "completions/mean_terminated_length": 6491.25, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.8654960840940475, + "epoch": 0.3781048758049678, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033910400234162807, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 361098567.0, + "reward": 0.5625, + "reward_std": 0.2306838035583496, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998576641082764, + "sampling/importance_sampling_ratio/min": 0.001449413481168449, + "sampling/sampling_logp_difference/max": 6.536596298217773, + "sampling/sampling_logp_difference/mean": 0.019660964608192444, + "step": 411 + }, + { + "clip_ratio/high_max": 2.3068858354236e-05, + "clip_ratio/high_mean": 7.792090059410839e-06, + "clip_ratio/low_mean": 5.8515578757578623e-05, + "clip_ratio/low_min": 1.0348648629587842e-05, + "clip_ratio/region_mean": 6.630766870330262e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7103.4453125, + "completions/mean_terminated_length": 6956.13525390625, + "completions/min_length": 1711.0, + "completions/min_terminated_length": 1711.0, + "entropy": 0.8317076042294502, + "epoch": 0.37902483900643974, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036110079381614923, + "learning_rate": 1e-05, + "loss": 0.0834, + "num_tokens": 362027520.0, + "reward": 0.546875, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 
0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999338984489441, + "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05, + "sampling/sampling_logp_difference/max": 11.458046913146973, + "sampling/sampling_logp_difference/mean": 0.01939362846314907, + "step": 412 + }, + { + "clip_ratio/high_max": 3.112394779236638e-06, + "clip_ratio/high_mean": 7.780986948091595e-07, + "clip_ratio/low_mean": 5.127149995587388e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.204959859383962e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15830.0, + "completions/mean_length": 7344.9296875, + "completions/mean_terminated_length": 6900.384765625, + "completions/min_length": 1368.0, + "completions/min_terminated_length": 1368.0, + "entropy": 0.8387318029999733, + "epoch": 0.37994480220791166, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002141098491847515, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 362985207.0, + "reward": 0.34375, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322891235352, + "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05, + "sampling/sampling_logp_difference/max": 10.874617576599121, + "sampling/sampling_logp_difference/mean": 0.01929464004933834, + "step": 413 + }, + { + "clip_ratio/high_max": 5.2602786126954015e-06, + "clip_ratio/high_mean": 1.3150696531738504e-06, + "clip_ratio/low_mean": 1.7854434247510653e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9169503786997666e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16137.0, + 
"completions/mean_length": 6377.7734375, + "completions/mean_terminated_length": 6218.94482421875, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9732858911156654, + "epoch": 0.38086476540938363, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015244127716869116, + "learning_rate": 1e-05, + "loss": 0.0608, + "num_tokens": 363823914.0, + "reward": 0.4375, + "reward_std": 0.1988610327243805, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 0.006335465237498283, + "sampling/sampling_logp_difference/max": 5.061592102050781, + "sampling/sampling_logp_difference/mean": 0.020688029006123543, + "step": 414 + }, + { + "clip_ratio/high_max": 2.6195500595349586e-05, + "clip_ratio/high_mean": 6.548875148837396e-06, + "clip_ratio/low_mean": 3.3802934012783226e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035180882056011e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14456.0, + "completions/mean_length": 5599.7890625, + "completions/mean_terminated_length": 5340.96826171875, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8872368410229683, + "epoch": 0.38178472861085555, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002647512126713991, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 364561127.0, + "reward": 0.453125, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999077916145325, + "sampling/importance_sampling_ratio/min": 2.370526999584399e-06, + "sampling/sampling_logp_difference/max": 
12.952398300170898, + "sampling/sampling_logp_difference/mean": 0.01878243312239647, + "step": 415 + }, + { + "clip_ratio/high_max": 2.157278959202813e-05, + "clip_ratio/high_mean": 5.3931973980070325e-06, + "clip_ratio/low_mean": 7.215861739950924e-05, + "clip_ratio/low_min": 1.4898997051204788e-05, + "clip_ratio/region_mean": 7.755181559332414e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15905.0, + "completions/mean_length": 7877.2890625, + "completions/mean_terminated_length": 7385.1650390625, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.8416353687644005, + "epoch": 0.3827046918123275, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018051012884825468, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 365590124.0, + "reward": 0.3125, + "reward_std": 0.28407180309295654, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.0004095165350008756, + "sampling/sampling_logp_difference/max": 7.800533294677734, + "sampling/sampling_logp_difference/mean": 0.019809434190392494, + "step": 416 + }, + { + "clip_ratio/high_max": 2.540994637456606e-05, + "clip_ratio/high_mean": 6.352486593641515e-06, + "clip_ratio/low_mean": 4.230594890941575e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8658435844117776e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16083.0, + "completions/mean_length": 6836.7890625, + "completions/mean_terminated_length": 6200.30859375, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.8647575601935387, + "epoch": 0.38362465501379944, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.004550795070827007, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 366486337.0, + "reward": 0.40625, + "reward_std": 0.22620806097984314, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873638153076, + "sampling/importance_sampling_ratio/min": 0.0001089095021598041, + "sampling/sampling_logp_difference/max": 9.124993324279785, + "sampling/sampling_logp_difference/mean": 0.01992485672235489, + "step": 417 + }, + { + "clip_ratio/high_max": 1.1592664577619871e-05, + "clip_ratio/high_mean": 2.8981661444049678e-06, + "clip_ratio/low_mean": 3.5717548257707676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.861571451579948e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16286.0, + "completions/mean_length": 6884.953125, + "completions/mean_terminated_length": 6417.78662109375, + "completions/min_length": 1289.0, + "completions/min_terminated_length": 1289.0, + "entropy": 0.8691708743572235, + "epoch": 0.3845446182152714, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005958946421742439, + "learning_rate": 1e-05, + "loss": 0.1054, + "num_tokens": 367386163.0, + "reward": 0.5078125, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 9.519772902422119e-06, + "sampling/sampling_logp_difference/max": 11.562139511108398, + "sampling/sampling_logp_difference/mean": 0.019436441361904144, + "step": 418 + }, + { + "clip_ratio/high_max": 2.7658640192385064e-05, + "clip_ratio/high_mean": 8.455849524580117e-06, + "clip_ratio/low_mean": 3.938097847822064e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.7836828116487595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15574.0, + "completions/mean_length": 7439.1328125, + "completions/mean_terminated_length": 7150.58837890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.795464999973774, + "epoch": 0.38546458141674333, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00558120384812355, + "learning_rate": 1e-05, + "loss": 0.1918, + "num_tokens": 368357500.0, + "reward": 0.609375, + "reward_std": 0.3795146346092224, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.0001159337698481977, + "sampling/sampling_logp_difference/max": 9.062491416931152, + "sampling/sampling_logp_difference/mean": 0.018824251368641853, + "step": 419 + }, + { + "clip_ratio/high_max": 8.509555527780321e-06, + "clip_ratio/high_mean": 2.1273888819450804e-06, + "clip_ratio/low_mean": 3.0958593640662e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.308598269313734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16236.0, + "completions/mean_length": 6751.53125, + "completions/mean_terminated_length": 6520.3525390625, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 0.9450879693031311, + "epoch": 0.38638454461821525, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004628168884664774, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 369242920.0, + "reward": 0.359375, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.0006074689445085824, + "sampling/sampling_logp_difference/max": 7.406209468841553, + "sampling/sampling_logp_difference/mean": 0.019376013427972794, + "step": 420 + }, + { + "clip_ratio/high_max": 1.8288420505996328e-05, + "clip_ratio/high_mean": 4.572105126499082e-06, + "clip_ratio/low_mean": 4.86290555272717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320115997164976e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16164.0, + "completions/mean_length": 7023.296875, + "completions/mean_terminated_length": 6315.3447265625, + "completions/min_length": 1628.0, + "completions/min_terminated_length": 1628.0, + "entropy": 0.7378111630678177, + "epoch": 0.3873045078196872, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00389425759203732, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 370159510.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999127388000488, + "sampling/importance_sampling_ratio/min": 0.00014012664905749261, + "sampling/sampling_logp_difference/max": 8.872963905334473, + "sampling/sampling_logp_difference/mean": 0.016914553940296173, + "step": 421 + }, + { + "clip_ratio/high_max": 2.1269573153404053e-05, + "clip_ratio/high_mean": 5.948400371380558e-06, + "clip_ratio/low_mean": 2.3538930747690756e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9487331687505502e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16018.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 7702.3046875, + "completions/mean_terminated_length": 7702.3046875, + "completions/min_length": 423.0, + 
"completions/min_terminated_length": 423.0, + "entropy": 0.9053447172045708, + "epoch": 0.38822447102115915, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004324545152485371, + "learning_rate": 1e-05, + "loss": 0.0149, + "num_tokens": 371162773.0, + "reward": 0.2421875, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00001060962677, + "sampling/importance_sampling_ratio/min": 2.283278627146501e-05, + "sampling/sampling_logp_difference/max": 10.687313079833984, + "sampling/sampling_logp_difference/mean": 0.020495830103754997, + "step": 422 + }, + { + "clip_ratio/high_max": 1.0294916819475475e-05, + "clip_ratio/high_mean": 2.5737292048688687e-06, + "clip_ratio/low_mean": 5.831611520079605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.088984559937671e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 6904.78125, + "completions/mean_terminated_length": 6754.31787109375, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.7991176024079323, + "epoch": 0.3891444342226311, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003239463549107313, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 372067241.0, + "reward": 0.328125, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00012340991816017777, + "sampling/sampling_logp_difference/max": 8.999999046325684, + "sampling/sampling_logp_difference/mean": 0.019042208790779114, + "step": 423 + }, + { + "clip_ratio/high_max": 
2.7261318791715894e-05, + "clip_ratio/high_mean": 7.926559305815317e-06, + "clip_ratio/low_mean": 1.552133551285806e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3447895273420727e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15399.0, + "completions/mean_length": 6107.7421875, + "completions/mean_terminated_length": 5602.35205078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.9495253190398216, + "epoch": 0.39006439742410304, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015464330790564418, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 372866072.0, + "reward": 0.421875, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971330165863, + "sampling/importance_sampling_ratio/min": 0.00024684349773451686, + "sampling/sampling_logp_difference/max": 8.306756019592285, + "sampling/sampling_logp_difference/mean": 0.019793221727013588, + "step": 424 + }, + { + "clip_ratio/high_max": 2.457227401464479e-05, + "clip_ratio/high_mean": 8.533324717063806e-06, + "clip_ratio/low_mean": 3.261690835643094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.115023284612107e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15939.0, + "completions/mean_length": 6079.8046875, + "completions/mean_terminated_length": 5747.4111328125, + "completions/min_length": 1082.0, + "completions/min_terminated_length": 1082.0, + "entropy": 0.8005363270640373, + "epoch": 0.39098436062557496, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024811832699924707, + "learning_rate": 1e-05, + "loss": 0.1124, + "num_tokens": 373663463.0, + "reward": 0.625, + "reward_std": 
0.2630355656147003, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743103981018, + "sampling/importance_sampling_ratio/min": 0.00019348970090504736, + "sampling/sampling_logp_difference/max": 8.550286293029785, + "sampling/sampling_logp_difference/mean": 0.017151469364762306, + "step": 425 + }, + { + "clip_ratio/high_max": 3.3719989005476236e-06, + "clip_ratio/high_mean": 8.429997251369059e-07, + "clip_ratio/low_mean": 2.132218082806503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2165180553201935e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14925.0, + "completions/mean_length": 6453.7890625, + "completions/mean_terminated_length": 6375.5986328125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "entropy": 0.9212624430656433, + "epoch": 0.39190432382704693, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031475063879042864, + "learning_rate": 1e-05, + "loss": 0.0959, + "num_tokens": 374517492.0, + "reward": 0.34375, + "reward_std": 0.19910329580307007, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999594688415527, + "sampling/importance_sampling_ratio/min": 0.015664709731936455, + "sampling/sampling_logp_difference/max": 4.156344890594482, + "sampling/sampling_logp_difference/mean": 0.019899867475032806, + "step": 426 + }, + { + "clip_ratio/high_max": 1.907509408738406e-05, + "clip_ratio/high_mean": 5.984868664654641e-06, + "clip_ratio/low_mean": 3.784128080042137e-05, + "clip_ratio/low_min": 3.7751804029539926e-06, + "clip_ratio/region_mean": 4.382614952191943e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16159.0, + 
"completions/max_terminated_length": 16159.0, + "completions/mean_length": 6126.9921875, + "completions/mean_terminated_length": 6126.9921875, + "completions/min_length": 1106.0, + "completions/min_terminated_length": 1106.0, + "entropy": 0.8252849578857422, + "epoch": 0.39282428702851885, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004200868774205446, + "learning_rate": 1e-05, + "loss": 0.0276, + "num_tokens": 375320339.0, + "reward": 0.4140625, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999815225601196, + "sampling/importance_sampling_ratio/min": 0.005763276945799589, + "sampling/sampling_logp_difference/max": 5.156249046325684, + "sampling/sampling_logp_difference/mean": 0.01833093911409378, + "step": 427 + }, + { + "clip_ratio/high_max": 1.8918785372079583e-05, + "clip_ratio/high_mean": 5.476571459439583e-06, + "clip_ratio/low_mean": 6.169724406390742e-05, + "clip_ratio/low_min": 7.494657666029525e-06, + "clip_ratio/region_mean": 6.717381506859965e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15411.0, + "completions/mean_length": 6739.09375, + "completions/mean_terminated_length": 6427.9677734375, + "completions/min_length": 1228.0, + "completions/min_terminated_length": 1228.0, + "entropy": 0.8008574098348618, + "epoch": 0.3937442502299908, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003204014617949724, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 376201015.0, + "reward": 0.5390625, + "reward_std": 0.37086254358291626, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998303651809692, + "sampling/importance_sampling_ratio/min": 
0.00010144581028725952, + "sampling/sampling_logp_difference/max": 9.195985794067383, + "sampling/sampling_logp_difference/mean": 0.018961725756525993, + "step": 428 + }, + { + "clip_ratio/high_max": 1.3558789078160771e-05, + "clip_ratio/high_mean": 3.389697269540193e-06, + "clip_ratio/low_mean": 5.3925050679026754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.731474743697618e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15634.0, + "completions/mean_length": 7245.8984375, + "completions/mean_terminated_length": 6951.12060546875, + "completions/min_length": 1306.0, + "completions/min_terminated_length": 1306.0, + "entropy": 1.0351596996188164, + "epoch": 0.39466421343146274, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0039763906970620155, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 377149650.0, + "reward": 0.375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000600814819336, + "sampling/importance_sampling_ratio/min": 8.106228051474318e-05, + "sampling/sampling_logp_difference/max": 9.420292854309082, + "sampling/sampling_logp_difference/mean": 0.020948028191924095, + "step": 429 + }, + { + "clip_ratio/high_max": 1.4580486549675697e-05, + "clip_ratio/high_mean": 4.259903903403028e-06, + "clip_ratio/low_mean": 4.6149686397711775e-05, + "clip_ratio/low_min": 3.006686938533676e-06, + "clip_ratio/region_mean": 5.04095905853319e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 6958.625, + "completions/mean_terminated_length": 6495.08154296875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "entropy": 0.8360240310430527, + "epoch": 
0.39558417663293466, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0031417158897966146, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 378057802.0, + "reward": 0.515625, + "reward_std": 0.35771697759628296, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999384880065918, + "sampling/importance_sampling_ratio/min": 0.00010235882655251771, + "sampling/sampling_logp_difference/max": 9.187026023864746, + "sampling/sampling_logp_difference/mean": 0.019185224547982216, + "step": 430 + }, + { + "clip_ratio/high_max": 6.681633749394678e-06, + "clip_ratio/high_mean": 1.6704084373486694e-06, + "clip_ratio/low_mean": 5.096616632727091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.263657521936693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15410.0, + "completions/max_terminated_length": 15410.0, + "completions/mean_length": 5696.3984375, + "completions/mean_terminated_length": 5696.3984375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.7887749597430229, + "epoch": 0.39650413983440663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004943124484270811, + "learning_rate": 1e-05, + "loss": 0.096, + "num_tokens": 378808021.0, + "reward": 0.515625, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999057054519653, + "sampling/importance_sampling_ratio/min": 0.0015042300801724195, + "sampling/sampling_logp_difference/max": 6.499474048614502, + "sampling/sampling_logp_difference/mean": 0.018845941871404648, + "step": 431 + }, + { + "clip_ratio/high_max": 1.7526824194646906e-05, + "clip_ratio/high_mean": 5.417880970526312e-06, + "clip_ratio/low_mean": 
3.513921649300755e-05, + "clip_ratio/low_min": 6.075038982089609e-06, + "clip_ratio/region_mean": 4.0557096895099676e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14233.0, + "completions/mean_length": 6480.8828125, + "completions/mean_terminated_length": 6323.69091796875, + "completions/min_length": 1013.0, + "completions/min_terminated_length": 1013.0, + "entropy": 0.8796411231160164, + "epoch": 0.39742410303587855, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00595651101320982, + "learning_rate": 1e-05, + "loss": 0.0546, + "num_tokens": 379659710.0, + "reward": 0.3984375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 0.0017907419241964817, + "sampling/sampling_logp_difference/max": 6.325125217437744, + "sampling/sampling_logp_difference/mean": 0.01906527951359749, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4512424602107785e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4512424602107785e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 7501.703125, + "completions/mean_terminated_length": 6829.93310546875, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "entropy": 0.786028303205967, + "epoch": 0.3983440662373505, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0024527597706764936, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 380640720.0, + "reward": 0.5234375, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999595880508423, + "sampling/importance_sampling_ratio/min": 8.851602615322918e-07, + "sampling/sampling_logp_difference/max": 13.93749713897705, + "sampling/sampling_logp_difference/mean": 0.01873261108994484, + "step": 433 + }, + { + "clip_ratio/high_max": 1.4606259583160863e-05, + "clip_ratio/high_mean": 5.505394312876888e-06, + "clip_ratio/low_mean": 3.1679782978244475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7185177234277944e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15185.0, + "completions/mean_length": 5619.2890625, + "completions/mean_terminated_length": 5448.4208984375, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.8098893761634827, + "epoch": 0.39926402943882244, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004280989523977041, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 381377981.0, + "reward": 0.609375, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.0010248658945783973, + "sampling/sampling_logp_difference/max": 6.883193492889404, + "sampling/sampling_logp_difference/mean": 0.017923470586538315, + "step": 434 + }, + { + "clip_ratio/high_max": 1.4808703554081148e-05, + "clip_ratio/high_mean": 3.702175888520287e-06, + "clip_ratio/low_mean": 2.3637440563106793e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7339616224253405e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5243.8203125, + "completions/mean_terminated_length": 
5156.1025390625, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "entropy": 0.7485036551952362, + "epoch": 0.40018399264029436, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004721642471849918, + "learning_rate": 1e-05, + "loss": 0.0877, + "num_tokens": 382070478.0, + "reward": 0.6875, + "reward_std": 0.26538965106010437, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999414086341858, + "sampling/importance_sampling_ratio/min": 0.0011518355458974838, + "sampling/sampling_logp_difference/max": 6.7663984298706055, + "sampling/sampling_logp_difference/mean": 0.016579966992139816, + "step": 435 + }, + { + "clip_ratio/high_max": 3.1177480195765384e-05, + "clip_ratio/high_mean": 1.1174359769938746e-05, + "clip_ratio/low_mean": 3.602651599976525e-05, + "clip_ratio/low_min": 4.348733455117326e-06, + "clip_ratio/region_mean": 4.720087713394605e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15978.0, + "completions/mean_length": 7021.1796875, + "completions/mean_terminated_length": 6872.56396484375, + "completions/min_length": 1371.0, + "completions/min_terminated_length": 1371.0, + "entropy": 0.8693460151553154, + "epoch": 0.40110395584176634, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00329192029312253, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 382990245.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.0023386883549392223, + "sampling/sampling_logp_difference/max": 6.058165073394775, + "sampling/sampling_logp_difference/mean": 
0.019863136112689972, + "step": 436 + }, + { + "clip_ratio/high_max": 1.1192694955752813e-05, + "clip_ratio/high_mean": 2.7981737389382033e-06, + "clip_ratio/low_mean": 4.9078003257818636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.1876177280973934e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15344.0, + "completions/mean_length": 6917.625, + "completions/mean_terminated_length": 6452.0654296875, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "entropy": 0.8466897681355476, + "epoch": 0.40202391904323825, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0051889242604374886, + "learning_rate": 1e-05, + "loss": 0.1009, + "num_tokens": 383896717.0, + "reward": 0.4140625, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999983310699463, + "sampling/importance_sampling_ratio/min": 0.00015846389578655362, + "sampling/sampling_logp_difference/max": 8.749983787536621, + "sampling/sampling_logp_difference/mean": 0.019528398290276527, + "step": 437 + }, + { + "clip_ratio/high_max": 2.3224948108691024e-05, + "clip_ratio/high_mean": 8.263948757303297e-06, + "clip_ratio/low_mean": 3.8556312347282073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.682026019509067e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16175.0, + "completions/mean_length": 7487.5078125, + "completions/mean_terminated_length": 7346.2939453125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.9584660083055496, + "epoch": 0.4029438822447102, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002855573548004031, + "learning_rate": 1e-05, + "loss": 0.0087, + 
"num_tokens": 384872622.0, + "reward": 0.3828125, + "reward_std": 0.2477683424949646, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999386668205261, + "sampling/importance_sampling_ratio/min": 0.0038593418430536985, + "sampling/sampling_logp_difference/max": 5.557258605957031, + "sampling/sampling_logp_difference/mean": 0.0209865253418684, + "step": 438 + }, + { + "clip_ratio/high_max": 6.171620498207631e-06, + "clip_ratio/high_mean": 1.5429051245519076e-06, + "clip_ratio/low_mean": 2.98128834401723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.135578845103737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16092.0, + "completions/mean_length": 6637.5078125, + "completions/mean_terminated_length": 6323.1044921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 0.8841215297579765, + "epoch": 0.40386384544618215, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004437311552464962, + "learning_rate": 1e-05, + "loss": 0.0523, + "num_tokens": 385744023.0, + "reward": 0.3984375, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999136924743652, + "sampling/importance_sampling_ratio/min": 0.002925124252215028, + "sampling/sampling_logp_difference/max": 5.834418296813965, + "sampling/sampling_logp_difference/mean": 0.019490888342261314, + "step": 439 + }, + { + "clip_ratio/high_max": 1.3304874300956726e-05, + "clip_ratio/high_mean": 3.3262185752391815e-06, + "clip_ratio/low_mean": 5.443932013804442e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.776553894065728e-05, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15143.0, + "completions/mean_length": 5965.9765625, + "completions/mean_terminated_length": 5800.611328125, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "entropy": 0.8726934269070625, + "epoch": 0.4047838086476541, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002463799435645342, + "learning_rate": 1e-05, + "loss": -0.0075, + "num_tokens": 386525492.0, + "reward": 0.3984375, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.00020367901015561074, + "sampling/sampling_logp_difference/max": 8.4989652633667, + "sampling/sampling_logp_difference/mean": 0.01946769654750824, + "step": 440 + }, + { + "clip_ratio/high_max": 1.0084711902891286e-05, + "clip_ratio/high_mean": 3.6154040117253317e-06, + "clip_ratio/low_mean": 3.598771945689805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9603123695997056e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6693.109375, + "completions/mean_terminated_length": 6616.80322265625, + "completions/min_length": 1704.0, + "completions/min_terminated_length": 1704.0, + "entropy": 0.9430640190839767, + "epoch": 0.40570377184912604, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038990566972643137, + "learning_rate": 1e-05, + "loss": 0.0415, + "num_tokens": 387404842.0, + "reward": 0.421875, + "reward_std": 0.31587693095207214, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999700784683228, + 
"sampling/importance_sampling_ratio/min": 0.0011708902893587947, + "sampling/sampling_logp_difference/max": 6.749990940093994, + "sampling/sampling_logp_difference/mean": 0.020848294720053673, + "step": 441 + }, + { + "clip_ratio/high_max": 7.462686426151777e-06, + "clip_ratio/high_mean": 1.8656716065379442e-06, + "clip_ratio/low_mean": 5.234285907818048e-05, + "clip_ratio/low_min": 4.47803950009984e-06, + "clip_ratio/region_mean": 5.420853057103159e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + "completions/mean_length": 7045.6953125, + "completions/mean_terminated_length": 6505.46240234375, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "entropy": 0.8912066072225571, + "epoch": 0.40662373505059796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018510994268581271, + "learning_rate": 1e-05, + "loss": 0.099, + "num_tokens": 388324475.0, + "reward": 0.40625, + "reward_std": 0.32195523381233215, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999024868011475, + "sampling/importance_sampling_ratio/min": 0.0031757301185280085, + "sampling/sampling_logp_difference/max": 5.752217769622803, + "sampling/sampling_logp_difference/mean": 0.020547039806842804, + "step": 442 + }, + { + "clip_ratio/high_max": 2.504527083146968e-05, + "clip_ratio/high_mean": 6.26131770786742e-06, + "clip_ratio/low_mean": 6.165269871871715e-05, + "clip_ratio/low_min": 3.5272871627967106e-06, + "clip_ratio/region_mean": 6.791401551708987e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15734.0, + "completions/mean_length": 7480.0078125, + "completions/mean_terminated_length": 7266.3125, + "completions/min_length": 1130.0, + "completions/min_terminated_length": 
1130.0, + "entropy": 0.8813760280609131, + "epoch": 0.40754369825206993, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004439481534063816, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 389305644.0, + "reward": 0.34375, + "reward_std": 0.31300368905067444, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999762773513794, + "sampling/importance_sampling_ratio/min": 0.007449973840266466, + "sampling/sampling_logp_difference/max": 4.899544715881348, + "sampling/sampling_logp_difference/mean": 0.01973455585539341, + "step": 443 + }, + { + "clip_ratio/high_max": 4.0980917219712865e-06, + "clip_ratio/high_mean": 1.0245229304928216e-06, + "clip_ratio/low_mean": 3.662567087303614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.76501939172158e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15302.0, + "completions/max_terminated_length": 15302.0, + "completions/mean_length": 7044.4453125, + "completions/mean_terminated_length": 7044.4453125, + "completions/min_length": 1229.0, + "completions/min_terminated_length": 1229.0, + "entropy": 0.9901906549930573, + "epoch": 0.40846366145354185, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004181519150733948, + "learning_rate": 1e-05, + "loss": -0.0068, + "num_tokens": 390229373.0, + "reward": 0.421875, + "reward_std": 0.17700131237506866, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000314712524414, + "sampling/importance_sampling_ratio/min": 0.00022536676260642707, + "sampling/sampling_logp_difference/max": 8.397781372070312, + "sampling/sampling_logp_difference/mean": 0.021211043000221252, + "step": 444 + }, + { + "clip_ratio/high_max": 1.4909872106727562e-05, + "clip_ratio/high_mean": 
3.7274680266818905e-06, + "clip_ratio/low_mean": 5.29995777469594e-05, + "clip_ratio/low_min": 3.708758640641463e-06, + "clip_ratio/region_mean": 5.672704537573736e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7815.8125, + "completions/mean_terminated_length": 7244.6005859375, + "completions/min_length": 1350.0, + "completions/min_terminated_length": 1350.0, + "entropy": 0.8278292864561081, + "epoch": 0.4093836246550138, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002691390924155712, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 391251141.0, + "reward": 0.3515625, + "reward_std": 0.31222954392433167, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 0.007715471088886261, + "sampling/sampling_logp_difference/max": 4.864527702331543, + "sampling/sampling_logp_difference/mean": 0.018415704369544983, + "step": 445 + }, + { + "clip_ratio/high_max": 2.1858722902834415e-05, + "clip_ratio/high_mean": 6.629899417021079e-06, + "clip_ratio/low_mean": 3.196247394043894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.859237290271267e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15202.0, + "completions/mean_length": 5305.1796875, + "completions/mean_terminated_length": 5217.94482421875, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.8100772425532341, + "epoch": 0.41030358785648574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0069543467834591866, + "learning_rate": 1e-05, + "loss": 0.1153, + "num_tokens": 391956196.0, + "reward": 0.609375, + "reward_std": 0.304571270942688, + 
"rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000190734863281, + "sampling/importance_sampling_ratio/min": 0.0024869756307452917, + "sampling/sampling_logp_difference/max": 5.996687889099121, + "sampling/sampling_logp_difference/mean": 0.017318082973361015, + "step": 446 + }, + { + "clip_ratio/high_max": 2.461934036546154e-05, + "clip_ratio/high_mean": 8.056288947955181e-06, + "clip_ratio/low_mean": 5.289376917971822e-05, + "clip_ratio/low_min": 4.21926688431995e-06, + "clip_ratio/region_mean": 6.0950058468733914e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15300.0, + "completions/mean_length": 7299.578125, + "completions/mean_terminated_length": 6930.29248046875, + "completions/min_length": 1008.0, + "completions/min_terminated_length": 1008.0, + "entropy": 0.9955824315547943, + "epoch": 0.41122355105795766, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0065611582249403, + "learning_rate": 1e-05, + "loss": 0.0883, + "num_tokens": 392908430.0, + "reward": 0.4375, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999696016311646, + "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06, + "sampling/sampling_logp_difference/max": 11.873339653015137, + "sampling/sampling_logp_difference/mean": 0.02127375639975071, + "step": 447 + }, + { + "clip_ratio/high_max": 2.4339562514796853e-05, + "clip_ratio/high_mean": 7.412756531266496e-06, + "clip_ratio/low_mean": 3.89272447591793e-05, + "clip_ratio/low_min": 4.047796210215893e-06, + "clip_ratio/region_mean": 4.6340001517819474e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16221.0, + "completions/mean_length": 6702.9375, + "completions/mean_terminated_length": 6390.64501953125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.82919991761446, + "epoch": 0.41214351425942963, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032975098583847284, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 393788286.0, + "reward": 0.4609375, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.00028582560480572283, + "sampling/sampling_logp_difference/max": 8.160128593444824, + "sampling/sampling_logp_difference/mean": 0.019461583346128464, + "step": 448 + }, + { + "clip_ratio/high_max": 2.3807599063729867e-05, + "clip_ratio/high_mean": 5.951899765932467e-06, + "clip_ratio/low_mean": 3.195798365140945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.790988330365508e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15244.0, + "completions/mean_length": 6468.9453125, + "completions/mean_terminated_length": 5536.7607421875, + "completions/min_length": 808.0, + "completions/min_terminated_length": 808.0, + "entropy": 0.6471721827983856, + "epoch": 0.41306347746090155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032787907402962446, + "learning_rate": 1e-05, + "loss": 0.1149, + "num_tokens": 394638159.0, + "reward": 0.625, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.00012341380352154374, 
+ "sampling/sampling_logp_difference/max": 8.999967575073242, + "sampling/sampling_logp_difference/mean": 0.016151495277881622, + "step": 449 + }, + { + "clip_ratio/high_max": 2.247072688987828e-05, + "clip_ratio/high_mean": 5.61768172246957e-06, + "clip_ratio/low_mean": 6.035319393049576e-05, + "clip_ratio/low_min": 4.063190772285452e-06, + "clip_ratio/region_mean": 6.597087667614687e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15931.0, + "completions/mean_length": 6547.3203125, + "completions/mean_terminated_length": 6230.0078125, + "completions/min_length": 587.0, + "completions/min_terminated_length": 587.0, + "entropy": 0.9123960956931114, + "epoch": 0.4139834406623735, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038375966250896454, + "learning_rate": 1e-05, + "loss": 0.0967, + "num_tokens": 395493872.0, + "reward": 0.4296875, + "reward_std": 0.30798619985580444, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.00016009423416107893, + "sampling/sampling_logp_difference/max": 8.739748001098633, + "sampling/sampling_logp_difference/mean": 0.019957344979047775, + "step": 450 + }, + { + "clip_ratio/high_max": 1.404482372890925e-05, + "clip_ratio/high_mean": 3.5112059322273126e-06, + "clip_ratio/low_mean": 2.315102483407827e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6662230766305584e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15058.0, + "completions/mean_length": 6291.859375, + "completions/mean_terminated_length": 6131.6669921875, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 0.9841655194759369, + "epoch": 0.41490340386384544, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.003903903067111969, + "learning_rate": 1e-05, + "loss": 0.0656, + "num_tokens": 396320254.0, + "reward": 0.4296875, + "reward_std": 0.2569621503353119, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 6.564632712979801e-06, + "sampling/sampling_logp_difference/max": 11.93381404876709, + "sampling/sampling_logp_difference/mean": 0.020753150805830956, + "step": 451 + }, + { + "clip_ratio/high_max": 1.5189204987109406e-05, + "clip_ratio/high_mean": 4.615214265868417e-06, + "clip_ratio/low_mean": 3.547988831087423e-05, + "clip_ratio/low_min": 3.3967392027989263e-06, + "clip_ratio/region_mean": 4.009510257674265e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15966.0, + "completions/mean_length": 7692.4296875, + "completions/mean_terminated_length": 7339.11376953125, + "completions/min_length": 1269.0, + "completions/min_terminated_length": 1269.0, + "entropy": 0.94080401211977, + "epoch": 0.41582336706531736, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005152889993041754, + "learning_rate": 1e-05, + "loss": 0.0511, + "num_tokens": 397327029.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 5.027571751270443e-05, + "sampling/sampling_logp_difference/max": 9.897988319396973, + "sampling/sampling_logp_difference/mean": 0.02036213129758835, + "step": 452 + }, + { + "clip_ratio/high_max": 1.733157705530175e-05, + "clip_ratio/high_mean": 6.0586507970583625e-06, + 
"clip_ratio/low_mean": 2.335082047011383e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9409470812424843e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15305.0, + "completions/mean_length": 6968.0859375, + "completions/mean_terminated_length": 6742.1044921875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9254838973283768, + "epoch": 0.41674333026678934, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035838852636516094, + "learning_rate": 1e-05, + "loss": 0.0182, + "num_tokens": 398237536.0, + "reward": 0.484375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.002404628787189722, + "sampling/sampling_logp_difference/max": 6.030359745025635, + "sampling/sampling_logp_difference/mean": 0.020200733095407486, + "step": 453 + }, + { + "clip_ratio/high_max": 4.464923677005572e-06, + "clip_ratio/high_mean": 1.116230919251393e-06, + "clip_ratio/low_mean": 3.311113533754906e-05, + "clip_ratio/low_min": 6.725854291289579e-06, + "clip_ratio/region_mean": 3.422736637048729e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16309.0, + "completions/mean_length": 8711.078125, + "completions/mean_terminated_length": 8199.55078125, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "entropy": 0.8735406622290611, + "epoch": 0.41766329346826125, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0036290446296334267, + "learning_rate": 1e-05, + "loss": 0.0412, + "num_tokens": 399373298.0, + "reward": 0.359375, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000042200088501, + "sampling/importance_sampling_ratio/min": 9.216561011271551e-05, + "sampling/sampling_logp_difference/max": 9.291923522949219, + "sampling/sampling_logp_difference/mean": 0.0201371181756258, + "step": 454 + }, + { + "clip_ratio/high_max": 3.4702664606811595e-05, + "clip_ratio/high_mean": 8.675666151702899e-06, + "clip_ratio/low_mean": 3.3217100849469716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.189276808119757e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14737.0, + "completions/mean_length": 6891.078125, + "completions/mean_terminated_length": 6663.24853515625, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "entropy": 0.8689641878008842, + "epoch": 0.41858325666973323, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004067540634423494, + "learning_rate": 1e-05, + "loss": 0.0633, + "num_tokens": 400273708.0, + "reward": 0.484375, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999425411224365, + "sampling/importance_sampling_ratio/min": 4.0002717582865444e-07, + "sampling/sampling_logp_difference/max": 14.731733322143555, + "sampling/sampling_logp_difference/mean": 0.019800148904323578, + "step": 455 + }, + { + "clip_ratio/high_max": 2.939170826721238e-06, + "clip_ratio/high_mean": 7.347927066803095e-07, + "clip_ratio/low_mean": 3.564125790944672e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6376050502440194e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15234.0, + "completions/mean_length": 
6899.3515625, + "completions/mean_terminated_length": 6748.8017578125, + "completions/min_length": 1149.0, + "completions/min_terminated_length": 1149.0, + "entropy": 0.9442604705691338, + "epoch": 0.41950321987120515, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026191689539700747, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 401177497.0, + "reward": 0.46875, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 0.0017910725437104702, + "sampling/sampling_logp_difference/max": 6.3249406814575195, + "sampling/sampling_logp_difference/mean": 0.021380646154284477, + "step": 456 + }, + { + "clip_ratio/high_max": 8.99604128790088e-06, + "clip_ratio/high_mean": 2.24901032197522e-06, + "clip_ratio/low_mean": 2.57235833487357e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.797259367071092e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16226.0, + "completions/mean_length": 7175.8359375, + "completions/mean_terminated_length": 7029.6748046875, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.8653769046068192, + "epoch": 0.4204231830726771, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003141516586765647, + "learning_rate": 1e-05, + "loss": 0.0674, + "num_tokens": 402115812.0, + "reward": 0.4375, + "reward_std": 0.21040895581245422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999862909317017, + "sampling/importance_sampling_ratio/min": 0.001265019178390503, + "sampling/sampling_logp_difference/max": 6.672667980194092, + 
"sampling/sampling_logp_difference/mean": 0.01970163732767105, + "step": 457 + }, + { + "clip_ratio/high_max": 1.0800059499160852e-05, + "clip_ratio/high_mean": 2.700014874790213e-06, + "clip_ratio/low_mean": 3.116219727417047e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3862211807900167e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 7090.8515625, + "completions/mean_terminated_length": 6791.072265625, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.9437825232744217, + "epoch": 0.42134314627414904, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001980370609089732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 403048385.0, + "reward": 0.4609375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 1.4011449138706666e-06, + "sampling/sampling_logp_difference/max": 13.47822093963623, + "sampling/sampling_logp_difference/mean": 0.021090596914291382, + "step": 458 + }, + { + "clip_ratio/high_max": 2.5482850560365478e-05, + "clip_ratio/high_mean": 6.370712640091369e-06, + "clip_ratio/low_mean": 4.8558076969129615e-05, + "clip_ratio/low_min": 4.8952420002024155e-06, + "clip_ratio/region_mean": 5.4928788131292094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16175.0, + "completions/mean_length": 7033.65625, + "completions/mean_terminated_length": 6809.24853515625, + "completions/min_length": 1007.0, + "completions/min_terminated_length": 1007.0, + "entropy": 0.8789731040596962, + "epoch": 0.42226310947562096, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.003833206370472908, + "learning_rate": 1e-05, + "loss": 0.059, + "num_tokens": 403968037.0, + "reward": 0.46875, + "reward_std": 0.28460076451301575, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000317096710205, + "sampling/importance_sampling_ratio/min": 0.0021942879538983107, + "sampling/sampling_logp_difference/max": 6.1218976974487305, + "sampling/sampling_logp_difference/mean": 0.019913772121071815, + "step": 459 + }, + { + "clip_ratio/high_max": 4.068877842655638e-06, + "clip_ratio/high_mean": 1.0172194606639096e-06, + "clip_ratio/low_mean": 6.774969961043098e-05, + "clip_ratio/low_min": 3.189914878021227e-06, + "clip_ratio/region_mean": 6.876691895740805e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 6992.8984375, + "completions/mean_terminated_length": 6611.14599609375, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "entropy": 0.857115626335144, + "epoch": 0.42318307267709293, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005315023008733988, + "learning_rate": 1e-05, + "loss": 0.1581, + "num_tokens": 404881584.0, + "reward": 0.3515625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000758171081543, + "sampling/importance_sampling_ratio/min": 4.546630952972919e-05, + "sampling/sampling_logp_difference/max": 9.998538970947266, + "sampling/sampling_logp_difference/mean": 0.01872519962489605, + "step": 460 + }, + { + "clip_ratio/high_max": 1.167047457784065e-05, + "clip_ratio/high_mean": 2.9176186444601626e-06, + "clip_ratio/low_mean": 3.3195502112448594e-05, + "clip_ratio/low_min": 
5.25188033861923e-06, + "clip_ratio/region_mean": 3.611312064322192e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 6623.2578125, + "completions/mean_terminated_length": 6226.4794921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.8803941905498505, + "epoch": 0.42410303587856485, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0074885934591293335, + "learning_rate": 1e-05, + "loss": 0.1076, + "num_tokens": 405749105.0, + "reward": 0.515625, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.0011723897187039256, + "sampling/sampling_logp_difference/max": 6.748711109161377, + "sampling/sampling_logp_difference/mean": 0.01930626854300499, + "step": 461 + }, + { + "clip_ratio/high_max": 4.11753080697963e-06, + "clip_ratio/high_mean": 1.0293827017449075e-06, + "clip_ratio/low_mean": 5.09268712676203e-05, + "clip_ratio/low_min": 1.1170248626513057e-05, + "clip_ratio/region_mean": 5.195625465148623e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15032.0, + "completions/mean_length": 7244.8203125, + "completions/mean_terminated_length": 6647.5419921875, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.9202689751982689, + "epoch": 0.4250229990800368, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003960717935115099, + "learning_rate": 1e-05, + "loss": 0.0536, + "num_tokens": 406704618.0, + "reward": 0.484375, + "reward_std": 0.2880108058452606, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 1.69715603988152e-05, + "sampling/sampling_logp_difference/max": 10.98397159576416, + "sampling/sampling_logp_difference/mean": 0.02019711770117283, + "step": 462 + }, + { + "clip_ratio/high_max": 2.874629831239872e-05, + "clip_ratio/high_mean": 1.0519701334033016e-05, + "clip_ratio/low_mean": 5.367962035052187e-05, + "clip_ratio/low_min": 6.5083827394119e-06, + "clip_ratio/region_mean": 6.419932219614566e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 7462.0546875, + "completions/mean_terminated_length": 6867.2587890625, + "completions/min_length": 669.0, + "completions/min_terminated_length": 669.0, + "entropy": 0.8141553401947021, + "epoch": 0.42594296228150874, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003602087963372469, + "learning_rate": 1e-05, + "loss": 0.1054, + "num_tokens": 407677177.0, + "reward": 0.421875, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999440312385559, + "sampling/importance_sampling_ratio/min": 0.0007806668290868402, + "sampling/sampling_logp_difference/max": 7.155362129211426, + "sampling/sampling_logp_difference/mean": 0.01856713369488716, + "step": 463 + }, + { + "clip_ratio/high_max": 2.6413443720230134e-05, + "clip_ratio/high_mean": 8.973188073468918e-06, + "clip_ratio/low_mean": 3.5997712757307454e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.497090230870526e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15750.0, + "completions/mean_length": 6683.1796875, + 
"completions/mean_terminated_length": 6529.19873046875, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "entropy": 0.9070071652531624, + "epoch": 0.42686292548298066, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004038481041789055, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 408552512.0, + "reward": 0.4609375, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 4.474630986806005e-05, + "sampling/sampling_logp_difference/max": 10.014501571655273, + "sampling/sampling_logp_difference/mean": 0.02077356167137623, + "step": 464 + }, + { + "clip_ratio/high_max": 1.7171289982798044e-05, + "clip_ratio/high_mean": 4.292822495699511e-06, + "clip_ratio/low_mean": 3.225401701456576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654683996501262e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15864.0, + "completions/mean_length": 6472.9453125, + "completions/mean_terminated_length": 5985.51611328125, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8807859197258949, + "epoch": 0.42778288868445263, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004457853268831968, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 409399257.0, + "reward": 0.421875, + "reward_std": 0.20517179369926453, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 0.0017577135004103184, + "sampling/sampling_logp_difference/max": 6.343741416931152, + 
"sampling/sampling_logp_difference/mean": 0.020475786179304123, + "step": 465 + }, + { + "clip_ratio/high_max": 5.442162637336878e-05, + "clip_ratio/high_mean": 1.584139977239829e-05, + "clip_ratio/low_mean": 5.706528349946893e-05, + "clip_ratio/low_min": 2.5156462925224332e-05, + "clip_ratio/region_mean": 7.290668463610928e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15896.0, + "completions/mean_length": 5989.78125, + "completions/mean_terminated_length": 5654.48388671875, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.8479711338877678, + "epoch": 0.42870285188592455, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033953245729207993, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 410185645.0, + "reward": 0.5, + "reward_std": 0.3735082745552063, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999676942825317, + "sampling/importance_sampling_ratio/min": 1.781588616722729e-05, + "sampling/sampling_logp_difference/max": 10.935420036315918, + "sampling/sampling_logp_difference/mean": 0.017986344173550606, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.2673244681500364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2673244681500364e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 8299.9453125, + "completions/mean_terminated_length": 8171.62744140625, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "entropy": 0.9363152608275414, + "epoch": 0.4296228150873965, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002381247701123357, + "learning_rate": 1e-05, + "loss": 
0.0651, + "num_tokens": 411268974.0, + "reward": 0.2890625, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 0.000553094083443284, + "sampling/sampling_logp_difference/max": 7.4999823570251465, + "sampling/sampling_logp_difference/mean": 0.021354343742132187, + "step": 467 + }, + { + "clip_ratio/high_max": 8.578695997130126e-06, + "clip_ratio/high_mean": 2.1446739992825314e-06, + "clip_ratio/low_mean": 2.84454882830687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.059016239603807e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14838.0, + "completions/mean_length": 7434.0546875, + "completions/mean_terminated_length": 7219.25634765625, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "entropy": 0.981913685798645, + "epoch": 0.43054277828886844, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006341467145830393, + "learning_rate": 1e-05, + "loss": -0.003, + "num_tokens": 412238117.0, + "reward": 0.390625, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 0.0019304680172353983, + "sampling/sampling_logp_difference/max": 6.249992847442627, + "sampling/sampling_logp_difference/mean": 0.02139873616397381, + "step": 468 + }, + { + "clip_ratio/high_max": 1.7187987396027893e-05, + "clip_ratio/high_mean": 5.150076049176278e-06, + "clip_ratio/low_mean": 5.4699471832009294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.9849548279089504e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15871.0, + "completions/mean_length": 7211.1796875, + "completions/mean_terminated_length": 7138.95263671875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "entropy": 0.9307222217321396, + "epoch": 0.43146274149034036, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002621602965518832, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 413182860.0, + "reward": 0.3203125, + "reward_std": 0.34716784954071045, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999529123306274, + "sampling/importance_sampling_ratio/min": 5.1446182624204084e-05, + "sampling/sampling_logp_difference/max": 9.874974250793457, + "sampling/sampling_logp_difference/mean": 0.020250719040632248, + "step": 469 + }, + { + "clip_ratio/high_max": 1.0867412584047997e-05, + "clip_ratio/high_mean": 3.9217885614561965e-06, + "clip_ratio/low_mean": 4.7740833792886406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.16626223543426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15726.0, + "completions/mean_length": 5349.4296875, + "completions/mean_terminated_length": 5174.2783203125, + "completions/min_length": 983.0, + "completions/min_terminated_length": 983.0, + "entropy": 1.0213474333286285, + "epoch": 0.43238270469181234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035241330042481422, + "learning_rate": 1e-05, + "loss": 0.0657, + "num_tokens": 413885963.0, + "reward": 0.3046875, + "reward_std": 0.25330984592437744, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.0003569081309251487, + "sampling/sampling_logp_difference/max": 7.938032150268555, + "sampling/sampling_logp_difference/mean": 0.01975759118795395, + "step": 470 + }, + { + "clip_ratio/high_max": 1.469514609198086e-05, + "clip_ratio/high_mean": 3.673786522995215e-06, + "clip_ratio/low_mean": 2.699725871480041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0671045237795624e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15357.0, + "completions/mean_length": 7542.8515625, + "completions/mean_terminated_length": 7257.65283203125, + "completions/min_length": 1359.0, + "completions/min_terminated_length": 1359.0, + "entropy": 0.8882969543337822, + "epoch": 0.43330266789328425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014164346503093839, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 414870560.0, + "reward": 0.3671875, + "reward_std": 0.20753081142902374, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000402927398682, + "sampling/importance_sampling_ratio/min": 6.435441900976002e-05, + "sampling/sampling_logp_difference/max": 9.651104927062988, + "sampling/sampling_logp_difference/mean": 0.020874422043561935, + "step": 471 + }, + { + "clip_ratio/high_max": 1.669827497607912e-05, + "clip_ratio/high_mean": 4.17456874401978e-06, + "clip_ratio/low_mean": 3.673103901746799e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.090560787517461e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7286.90625, + "completions/mean_terminated_length": 6993.451171875, + "completions/min_length": 977.0, + 
"completions/min_terminated_length": 977.0, + "entropy": 0.9254636988043785, + "epoch": 0.43422263109475623, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026956009678542614, + "learning_rate": 1e-05, + "loss": 0.0567, + "num_tokens": 415825252.0, + "reward": 0.328125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999917209148407, + "sampling/importance_sampling_ratio/min": 0.0019701423589140177, + "sampling/sampling_logp_difference/max": 6.229649543762207, + "sampling/sampling_logp_difference/mean": 0.0202642735093832, + "step": 472 + }, + { + "clip_ratio/high_max": 9.162045444099931e-06, + "clip_ratio/high_mean": 2.2905113610249828e-06, + "clip_ratio/low_mean": 3.818475033767754e-05, + "clip_ratio/low_min": 7.20606476534158e-06, + "clip_ratio/region_mean": 4.047526181238936e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15908.0, + "completions/mean_length": 7244.7421875, + "completions/mean_terminated_length": 6716.0244140625, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.7817923128604889, + "epoch": 0.43514259429622815, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022128887940198183, + "learning_rate": 1e-05, + "loss": 0.0577, + "num_tokens": 416774011.0, + "reward": 0.453125, + "reward_std": 0.2937847375869751, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0015034435782581568, + "sampling/sampling_logp_difference/max": 6.499997138977051, + "sampling/sampling_logp_difference/mean": 0.01840684749186039, + "step": 473 + }, + { + 
"clip_ratio/high_max": 1.2232871313244686e-05, + "clip_ratio/high_mean": 3.0582178283111716e-06, + "clip_ratio/low_mean": 3.636896872194484e-05, + "clip_ratio/low_min": 3.1460788250115e-06, + "clip_ratio/region_mean": 3.9427186266038916e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16254.0, + "completions/mean_length": 9042.90625, + "completions/mean_terminated_length": 8283.482421875, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "entropy": 0.9306210279464722, + "epoch": 0.43606255749770007, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034676652867347, + "learning_rate": 1e-05, + "loss": 0.0504, + "num_tokens": 417951311.0, + "reward": 0.265625, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999234080314636, + "sampling/importance_sampling_ratio/min": 0.0002641192404553294, + "sampling/sampling_logp_difference/max": 8.239109992980957, + "sampling/sampling_logp_difference/mean": 0.02112819254398346, + "step": 474 + }, + { + "clip_ratio/high_max": 2.5187824576278217e-05, + "clip_ratio/high_mean": 8.202394610634656e-06, + "clip_ratio/low_mean": 4.3606626604741905e-05, + "clip_ratio/low_min": 3.5752079838857753e-06, + "clip_ratio/region_mean": 5.1809020988002885e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15721.0, + "completions/mean_length": 6763.6328125, + "completions/mean_terminated_length": 6610.9287109375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9879302233457565, + "epoch": 0.43698252069917204, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030218157917261124, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 
418836184.0, + "reward": 0.484375, + "reward_std": 0.30091896653175354, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 0.0003778560785576701, + "sampling/sampling_logp_difference/max": 7.880997180938721, + "sampling/sampling_logp_difference/mean": 0.021101050078868866, + "step": 475 + }, + { + "clip_ratio/high_max": 1.0644185749697499e-05, + "clip_ratio/high_mean": 2.6610464374243747e-06, + "clip_ratio/low_mean": 6.21261324340594e-05, + "clip_ratio/low_min": 3.6509140954876784e-06, + "clip_ratio/region_mean": 6.478717887148377e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15675.0, + "completions/mean_length": 6794.25, + "completions/mean_terminated_length": 6564.09619140625, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 1.0259138569235802, + "epoch": 0.43790248390064396, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002881827764213085, + "learning_rate": 1e-05, + "loss": 0.0592, + "num_tokens": 419726192.0, + "reward": 0.265625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999275207519531, + "sampling/importance_sampling_ratio/min": 9.217044407705544e-07, + "sampling/sampling_logp_difference/max": 13.897041320800781, + "sampling/sampling_logp_difference/mean": 0.0210823193192482, + "step": 476 + }, + { + "clip_ratio/high_max": 1.108860487875063e-05, + "clip_ratio/high_mean": 2.7721512196876574e-06, + "clip_ratio/low_mean": 4.70996876629215e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9871839337356505e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14281.0, + "completions/max_terminated_length": 14281.0, + "completions/mean_length": 5648.2109375, + "completions/mean_terminated_length": 5648.2109375, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.88894472271204, + "epoch": 0.43882244710211593, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00289533962495625, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 420468867.0, + "reward": 0.484375, + "reward_std": 0.2675113081932068, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998449087142944, + "sampling/importance_sampling_ratio/min": 0.001372925122268498, + "sampling/sampling_logp_difference/max": 6.590811729431152, + "sampling/sampling_logp_difference/mean": 0.018499158322811127, + "step": 477 + }, + { + "clip_ratio/high_max": 4.753574557980755e-06, + "clip_ratio/high_mean": 1.1883936394951888e-06, + "clip_ratio/low_mean": 2.4103785335682915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5292179316238617e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15657.0, + "completions/mean_length": 6188.359375, + "completions/mean_terminated_length": 6026.52392578125, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "entropy": 0.8476063013076782, + "epoch": 0.43974241030358785, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002749695209786296, + "learning_rate": 1e-05, + "loss": 0.0012, + "num_tokens": 421280881.0, + "reward": 0.3671875, + "reward_std": 0.15991678833961487, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999796152114868, + "sampling/importance_sampling_ratio/min": 0.004578418098390102, + "sampling/sampling_logp_difference/max": 5.386401653289795, + "sampling/sampling_logp_difference/mean": 0.018456483259797096, + "step": 478 + }, + { + "clip_ratio/high_max": 4.1359915030625416e-05, + "clip_ratio/high_mean": 1.0339978757656354e-05, + "clip_ratio/low_mean": 4.786080125995795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8200780586048495e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 6864.3515625, + "completions/mean_terminated_length": 6635.88037109375, + "completions/min_length": 1065.0, + "completions/min_terminated_length": 1065.0, + "entropy": 0.8666203916072845, + "epoch": 0.4406623735050598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.005116373300552368, + "learning_rate": 1e-05, + "loss": 0.0347, + "num_tokens": 422177822.0, + "reward": 0.4453125, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 0.00020385721290949732, + "sampling/sampling_logp_difference/max": 8.498090744018555, + "sampling/sampling_logp_difference/mean": 0.01979806460440159, + "step": 479 + }, + { + "clip_ratio/high_max": 1.4544774558089557e-05, + "clip_ratio/high_mean": 3.6361936395223893e-06, + "clip_ratio/low_mean": 4.153812756158004e-05, + "clip_ratio/low_min": 3.606462769312202e-06, + "clip_ratio/region_mean": 4.51743208031985e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 7023.828125, + "completions/mean_terminated_length": 6799.18408203125, + "completions/min_length": 780.0, + 
"completions/min_terminated_length": 780.0, + "entropy": 0.9098334684967995, + "epoch": 0.44158233670653174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0020944855641573668, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 423096576.0, + "reward": 0.2734375, + "reward_std": 0.20858672261238098, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999480247497559, + "sampling/importance_sampling_ratio/min": 0.0027383591514080763, + "sampling/sampling_logp_difference/max": 5.900396347045898, + "sampling/sampling_logp_difference/mean": 0.020111342892050743, + "step": 480 + }, + { + "clip_ratio/high_max": 3.256236095694476e-05, + "clip_ratio/high_mean": 1.2372795026749372e-05, + "clip_ratio/low_mean": 5.0774355258909054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.314715119515313e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15527.0, + "completions/mean_length": 6666.828125, + "completions/mean_terminated_length": 6512.587890625, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "entropy": 0.9162466824054718, + "epoch": 0.44250229990800366, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003897767048329115, + "learning_rate": 1e-05, + "loss": 0.1151, + "num_tokens": 423968050.0, + "reward": 0.46875, + "reward_std": 0.3527044653892517, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0031828521750867367, + "sampling/sampling_logp_difference/max": 5.7499775886535645, + "sampling/sampling_logp_difference/mean": 0.019923247396945953, + "step": 481 + }, + { + "clip_ratio/high_max": 
1.5341902098953142e-05, + "clip_ratio/high_mean": 4.791600815678976e-06, + "clip_ratio/low_mean": 7.980174223121139e-05, + "clip_ratio/low_min": 2.6713308216130827e-05, + "clip_ratio/region_mean": 8.459334412691533e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16223.0, + "completions/mean_length": 7159.8046875, + "completions/mean_terminated_length": 7013.38916015625, + "completions/min_length": 1022.0, + "completions/min_terminated_length": 1022.0, + "entropy": 0.8444746807217598, + "epoch": 0.44342226310947563, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003038195427507162, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 424902953.0, + "reward": 0.359375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940037727356, + "sampling/importance_sampling_ratio/min": 7.431909580191132e-06, + "sampling/sampling_logp_difference/max": 11.809727668762207, + "sampling/sampling_logp_difference/mean": 0.019014043733477592, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.55851120666739e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.55851120666739e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14716.0, + "completions/mean_length": 6146.2109375, + "completions/mean_terminated_length": 6065.5986328125, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.8365580290555954, + "epoch": 0.44434222631094755, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025550283025950193, + "learning_rate": 1e-05, + "loss": 0.0548, + "num_tokens": 425709212.0, + "reward": 0.5625, + "reward_std": 0.2688094973564148, + 
"rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000015497207642, + "sampling/importance_sampling_ratio/min": 0.0006884043687023222, + "sampling/sampling_logp_difference/max": 7.281134128570557, + "sampling/sampling_logp_difference/mean": 0.019193854182958603, + "step": 483 + }, + { + "clip_ratio/high_max": 2.4752349872869672e-05, + "clip_ratio/high_mean": 7.036488455014478e-06, + "clip_ratio/low_mean": 4.780410063176532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.484058920046664e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16153.0, + "completions/mean_length": 6557.578125, + "completions/mean_terminated_length": 6321.744140625, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.8316832035779953, + "epoch": 0.4452621895124195, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005126865580677986, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 426566462.0, + "reward": 0.484375, + "reward_std": 0.27852246165275574, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 2.7536634661373682e-05, + "sampling/sampling_logp_difference/max": 10.499993324279785, + "sampling/sampling_logp_difference/mean": 0.01839536987245083, + "step": 484 + }, + { + "clip_ratio/high_max": 3.443571449679439e-05, + "clip_ratio/high_mean": 8.608928624198597e-06, + "clip_ratio/low_mean": 5.915772453590762e-05, + "clip_ratio/low_min": 1.7084812043322017e-05, + "clip_ratio/region_mean": 6.776665304641938e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16359.0, + "completions/mean_length": 7007.3203125, + "completions/mean_terminated_length": 6858.484375, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8674142584204674, + "epoch": 0.44618215271389144, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004829525947570801, + "learning_rate": 1e-05, + "loss": 0.0753, + "num_tokens": 427480007.0, + "reward": 0.46875, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998922944068909, + "sampling/importance_sampling_ratio/min": 0.00020170137577224523, + "sampling/sampling_logp_difference/max": 8.508722305297852, + "sampling/sampling_logp_difference/mean": 0.019586069509387016, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.539863354897534e-05, + "clip_ratio/low_min": 8.211341992137022e-06, + "clip_ratio/region_mean": 5.539863354897534e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14748.0, + "completions/mean_length": 7069.8828125, + "completions/mean_terminated_length": 6922.0400390625, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.9066255167126656, + "epoch": 0.44710211591536336, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003539952216669917, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 428404968.0, + "reward": 0.5, + "reward_std": 0.3618982434272766, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 0.00024052867956925184, + 
"sampling/sampling_logp_difference/max": 8.332671165466309, + "sampling/sampling_logp_difference/mean": 0.020427238196134567, + "step": 486 + }, + { + "clip_ratio/high_max": 1.6550495729461545e-05, + "clip_ratio/high_mean": 4.137623932365386e-06, + "clip_ratio/low_mean": 5.576918465521885e-05, + "clip_ratio/low_min": 1.2613936178240692e-05, + "clip_ratio/region_mean": 5.99068093833921e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15290.0, + "completions/max_terminated_length": 15290.0, + "completions/mean_length": 5586.6875, + "completions/mean_terminated_length": 5586.6875, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.9208655655384064, + "epoch": 0.44802207911683534, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0030504625756293535, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 429137176.0, + "reward": 0.515625, + "reward_std": 0.3480040729045868, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999984502792358, + "sampling/importance_sampling_ratio/min": 0.0005498559912666678, + "sampling/sampling_logp_difference/max": 7.50585412979126, + "sampling/sampling_logp_difference/mean": 0.019396595656871796, + "step": 487 + }, + { + "clip_ratio/high_max": 3.3761509712348925e-05, + "clip_ratio/high_mean": 8.440377428087231e-06, + "clip_ratio/low_mean": 3.6384140912559815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.482451868170756e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15404.0, + "completions/mean_length": 5266.265625, + "completions/mean_terminated_length": 4999.4404296875, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.7884859293699265, + "epoch": 0.44894204231830726, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003902251599356532, + "learning_rate": 1e-05, + "loss": -0.0077, + "num_tokens": 429836026.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.05675617232918739, + "sampling/sampling_logp_difference/max": 2.868990898132324, + "sampling/sampling_logp_difference/mean": 0.01770034246146679, + "step": 488 + }, + { + "clip_ratio/high_max": 2.2323702978610527e-05, + "clip_ratio/high_mean": 5.580925744652632e-06, + "clip_ratio/low_mean": 4.0199149452746497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.578007497002545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6398.53125, + "completions/mean_terminated_length": 6319.9052734375, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "entropy": 0.8982341960072517, + "epoch": 0.44986200551977923, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024998660665005445, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 430673446.0, + "reward": 0.421875, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797940254211, + "sampling/importance_sampling_ratio/min": 0.000612784584518522, + "sampling/sampling_logp_difference/max": 7.397497177124023, + "sampling/sampling_logp_difference/mean": 0.020521972328424454, + "step": 489 + }, + { + "clip_ratio/high_max": 3.1756624366607866e-05, + "clip_ratio/high_mean": 7.939156091651967e-06, + "clip_ratio/low_mean": 
8.124458963720826e-05, + "clip_ratio/low_min": 1.2379174222587608e-05, + "clip_ratio/region_mean": 8.91837471499457e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14374.0, + "completions/mean_length": 6277.65625, + "completions/mean_terminated_length": 6198.07861328125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8139145970344543, + "epoch": 0.45078196872125115, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00784115307033062, + "learning_rate": 1e-05, + "loss": 0.0798, + "num_tokens": 431497546.0, + "reward": 0.546875, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999848484992981, + "sampling/importance_sampling_ratio/min": 0.0006267798598855734, + "sampling/sampling_logp_difference/max": 7.37491512298584, + "sampling/sampling_logp_difference/mean": 0.01836184598505497, + "step": 490 + }, + { + "clip_ratio/high_max": 8.875004823494237e-06, + "clip_ratio/high_mean": 2.2187512058735592e-06, + "clip_ratio/low_mean": 2.3825880248296016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6044631454169576e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15903.0, + "completions/mean_length": 7708.59375, + "completions/mean_terminated_length": 7355.9345703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.087083138525486, + "epoch": 0.45170193192272307, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004277343396097422, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 432503414.0, + "reward": 0.2890625, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999503493309021, + "sampling/importance_sampling_ratio/min": 1.2187546417408157e-05, + "sampling/sampling_logp_difference/max": 11.315095901489258, + "sampling/sampling_logp_difference/mean": 0.02224145457148552, + "step": 491 + }, + { + "clip_ratio/high_max": 6.384065272868611e-06, + "clip_ratio/high_mean": 1.5960163182171527e-06, + "clip_ratio/low_mean": 3.561227788395627e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.720829374742607e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7162.7109375, + "completions/mean_terminated_length": 6865.25, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.9157010763883591, + "epoch": 0.45262189512419504, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006278311368077993, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 433439137.0, + "reward": 0.5078125, + "reward_std": 0.2227931171655655, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966561794281, + "sampling/importance_sampling_ratio/min": 0.0005532125360332429, + "sampling/sampling_logp_difference/max": 7.499768257141113, + "sampling/sampling_logp_difference/mean": 0.02123419940471649, + "step": 492 + }, + { + "clip_ratio/high_max": 2.846911434062349e-05, + "clip_ratio/high_mean": 8.656040449750435e-06, + "clip_ratio/low_mean": 5.1716241614485625e-05, + "clip_ratio/low_min": 3.601579010137357e-06, + "clip_ratio/region_mean": 6.037228104105452e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16123.0, + "completions/mean_length": 7388.90625, + 
"completions/mean_terminated_length": 7023.251953125, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "entropy": 0.7670486867427826, + "epoch": 0.45354185832566696, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005177734419703484, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 434402045.0, + "reward": 0.3828125, + "reward_std": 0.37951958179473877, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999250769615173, + "sampling/importance_sampling_ratio/min": 0.0022511729039251804, + "sampling/sampling_logp_difference/max": 6.096303939819336, + "sampling/sampling_logp_difference/mean": 0.01827731542289257, + "step": 493 + }, + { + "clip_ratio/high_max": 2.1548471977439476e-05, + "clip_ratio/high_mean": 6.257203722270788e-06, + "clip_ratio/low_mean": 7.719641234871233e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.345361538886209e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 6805.375, + "completions/mean_terminated_length": 6496.38671875, + "completions/min_length": 587.0, + "completions/min_terminated_length": 587.0, + "entropy": 0.8407405763864517, + "epoch": 0.45446182152713893, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032320048194378614, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 435292029.0, + "reward": 0.4296875, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999642372131348, + "sampling/importance_sampling_ratio/min": 6.679954094579443e-05, + "sampling/sampling_logp_difference/max": 9.613814353942871, + 
"sampling/sampling_logp_difference/mean": 0.018761277198791504, + "step": 494 + }, + { + "clip_ratio/high_max": 3.460495008766884e-06, + "clip_ratio/high_mean": 8.65123752191721e-07, + "clip_ratio/low_mean": 7.76378024056612e-05, + "clip_ratio/low_min": 1.7026316072588088e-05, + "clip_ratio/region_mean": 7.850292649891344e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15105.0, + "completions/mean_length": 5753.4140625, + "completions/mean_terminated_length": 5321.2763671875, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "entropy": 0.7848984077572823, + "epoch": 0.45538178472861085, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030854379292577505, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 436046842.0, + "reward": 0.578125, + "reward_std": 0.31405961513519287, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998626708984375, + "sampling/importance_sampling_ratio/min": 4.36544311810394e-09, + "sampling/sampling_logp_difference/max": 19.24954605102539, + "sampling/sampling_logp_difference/mean": 0.017733070999383926, + "step": 495 + }, + { + "clip_ratio/high_max": 1.7207588371093152e-05, + "clip_ratio/high_mean": 4.301897092773288e-06, + "clip_ratio/low_mean": 3.234025916754035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.664215591925313e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6522.84375, + "completions/mean_terminated_length": 6445.19677734375, + "completions/min_length": 1062.0, + "completions/min_terminated_length": 1062.0, + "entropy": 1.0593653172254562, + "epoch": 0.4563017479300828, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.003124243812635541, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 436899638.0, + "reward": 0.4140625, + "reward_std": 0.2706219553947449, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999418258666992, + "sampling/importance_sampling_ratio/min": 4.476920821616659e-06, + "sampling/sampling_logp_difference/max": 12.316575050354004, + "sampling/sampling_logp_difference/mean": 0.021180003881454468, + "step": 496 + }, + { + "clip_ratio/high_max": 1.1790433973146719e-05, + "clip_ratio/high_mean": 2.9476084932866797e-06, + "clip_ratio/low_mean": 2.8437304308681632e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.138491274512489e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14515.0, + "completions/mean_length": 6203.203125, + "completions/mean_terminated_length": 5874.7900390625, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.8152795508503914, + "epoch": 0.45722171113155474, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005001795012503862, + "learning_rate": 1e-05, + "loss": 0.0817, + "num_tokens": 437713008.0, + "reward": 0.4296875, + "reward_std": 0.26143795251846313, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101758003235, + "sampling/importance_sampling_ratio/min": 0.001757707679644227, + "sampling/sampling_logp_difference/max": 6.34374475479126, + "sampling/sampling_logp_difference/mean": 0.017751028761267662, + "step": 497 + }, + { + "clip_ratio/high_max": 1.3163793028070359e-05, + "clip_ratio/high_mean": 4.229499381835922e-06, + "clip_ratio/low_mean": 4.4599403963729856e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.882890357293945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15423.0, + "completions/mean_length": 5975.5234375, + "completions/mean_terminated_length": 5725.72021484375, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "entropy": 0.8275932744145393, + "epoch": 0.45814167433302666, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005084732081741095, + "learning_rate": 1e-05, + "loss": 0.0759, + "num_tokens": 438495811.0, + "reward": 0.5390625, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998699426651001, + "sampling/importance_sampling_ratio/min": 3.120788460364565e-05, + "sampling/sampling_logp_difference/max": 10.374839782714844, + "sampling/sampling_logp_difference/mean": 0.018671832978725433, + "step": 498 + }, + { + "clip_ratio/high_max": 3.229640242352616e-06, + "clip_ratio/high_mean": 8.07410060588154e-07, + "clip_ratio/low_mean": 3.0413870263146237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1221280551108066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 7019.59375, + "completions/mean_terminated_length": 7019.59375, + "completions/min_length": 1058.0, + "completions/min_terminated_length": 1058.0, + "entropy": 0.9266618490219116, + "epoch": 0.45906163753449863, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002567912917584181, + "learning_rate": 1e-05, + "loss": 0.0282, + "num_tokens": 439413055.0, + "reward": 0.375, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000476837158203, + "sampling/importance_sampling_ratio/min": 0.0010315657127648592, + "sampling/sampling_logp_difference/max": 6.876677513122559, + "sampling/sampling_logp_difference/mean": 0.02012534812092781, + "step": 499 + }, + { + "clip_ratio/high_max": 1.8327779343962902e-05, + "clip_ratio/high_mean": 4.5819448359907256e-06, + "clip_ratio/low_mean": 4.08189575864526e-05, + "clip_ratio/low_min": 4.041122338094283e-06, + "clip_ratio/region_mean": 4.5400901854009135e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7373.3203125, + "completions/mean_terminated_length": 7082.65283203125, + "completions/min_length": 854.0, + "completions/min_terminated_length": 854.0, + "entropy": 0.9383682310581207, + "epoch": 0.45998160073597055, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004862098954617977, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 440375128.0, + "reward": 0.4375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.0006883886526338756, + "sampling/sampling_logp_difference/max": 7.28115701675415, + "sampling/sampling_logp_difference/mean": 0.020596595481038094, + "step": 500 + }, + { + "clip_ratio/high_max": 1.650619151405408e-05, + "clip_ratio/high_mean": 4.12654787851352e-06, + "clip_ratio/low_mean": 6.364750265674957e-05, + "clip_ratio/low_min": 3.94595599573222e-06, + "clip_ratio/region_mean": 6.77740499668289e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16280.0, + "completions/mean_length": 5944.953125, + "completions/mean_terminated_length": 5862.755859375, + 
"completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "entropy": 0.9130716845393181, + "epoch": 0.4609015639374425, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041388699784875, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 441156306.0, + "reward": 0.3984375, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999566078186035, + "sampling/importance_sampling_ratio/min": 0.0007685241289436817, + "sampling/sampling_logp_difference/max": 7.171038627624512, + "sampling/sampling_logp_difference/mean": 0.019817989319562912, + "step": 501 + }, + { + "clip_ratio/high_max": 2.9951792839710834e-05, + "clip_ratio/high_mean": 9.205811807078135e-06, + "clip_ratio/low_mean": 3.147234815514821e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0678160075913183e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16181.0, + "completions/mean_length": 6686.015625, + "completions/mean_terminated_length": 6609.6533203125, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 0.8640913739800453, + "epoch": 0.46182152713891444, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005679543130099773, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 442032972.0, + "reward": 0.5546875, + "reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 0.007731473073363304, + "sampling/sampling_logp_difference/max": 4.86245584487915, + "sampling/sampling_logp_difference/mean": 0.019738182425498962, + "step": 502 + 
}, + { + "clip_ratio/high_max": 3.0190597726686974e-05, + "clip_ratio/high_mean": 7.5476494316717435e-06, + "clip_ratio/low_mean": 3.858067566397949e-05, + "clip_ratio/low_min": 9.290916750614997e-06, + "clip_ratio/region_mean": 4.612832617567619e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 6945.5, + "completions/mean_terminated_length": 6231.6640625, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.8156519457697868, + "epoch": 0.46274149034038636, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006176612339913845, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 442940940.0, + "reward": 0.46875, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999117851257324, + "sampling/importance_sampling_ratio/min": 0.00018278000061400235, + "sampling/sampling_logp_difference/max": 8.607227325439453, + "sampling/sampling_logp_difference/mean": 0.01836501806974411, + "step": 503 + }, + { + "clip_ratio/high_max": 2.2105000425653998e-05, + "clip_ratio/high_mean": 6.28071654773521e-06, + "clip_ratio/low_mean": 3.060894187001395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6889658531436e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15847.0, + "completions/mean_length": 8068.5390625, + "completions/mean_terminated_length": 7363.8388671875, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "entropy": 0.8196670189499855, + "epoch": 0.46366145354185834, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021770994644612074, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 443992041.0, + "reward": 
0.4453125, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999759197235107, + "sampling/importance_sampling_ratio/min": 0.0001795605494407937, + "sampling/sampling_logp_difference/max": 8.624998092651367, + "sampling/sampling_logp_difference/mean": 0.019003838300704956, + "step": 504 + }, + { + "clip_ratio/high_max": 1.287241002501105e-05, + "clip_ratio/high_mean": 3.2181025062527624e-06, + "clip_ratio/low_mean": 4.5685408849749365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.89035115833758e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15168.0, + "completions/mean_length": 5209.140625, + "completions/mean_terminated_length": 5031.76220703125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.8851845487952232, + "epoch": 0.46458141674333026, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00788798462599516, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 444679675.0, + "reward": 0.4609375, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796748161316, + "sampling/importance_sampling_ratio/min": 0.00025673024356365204, + "sampling/sampling_logp_difference/max": 8.267484664916992, + "sampling/sampling_logp_difference/mean": 0.018808994442224503, + "step": 505 + }, + { + "clip_ratio/high_max": 2.294301202709903e-05, + "clip_ratio/high_mean": 6.590465602585027e-06, + "clip_ratio/low_mean": 5.944662643742049e-05, + "clip_ratio/low_min": 8.106994755507912e-06, + "clip_ratio/region_mean": 6.603709243790945e-05, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16259.0, + "completions/mean_length": 7558.8984375, + "completions/mean_terminated_length": 7274.21728515625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.003449946641922, + "epoch": 0.46550137994480223, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004547314252704382, + "learning_rate": 1e-05, + "loss": 0.1586, + "num_tokens": 445668126.0, + "reward": 0.421875, + "reward_std": 0.42293959856033325, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999848484992981, + "sampling/importance_sampling_ratio/min": 0.00011622780584730208, + "sampling/sampling_logp_difference/max": 9.059958457946777, + "sampling/sampling_logp_difference/mean": 0.02099413052201271, + "step": 506 + }, + { + "clip_ratio/high_max": 2.1350435872591333e-05, + "clip_ratio/high_mean": 6.047981628398702e-06, + "clip_ratio/low_mean": 8.880347786544007e-05, + "clip_ratio/low_min": 9.06585455595632e-06, + "clip_ratio/region_mean": 9.485145938015194e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16137.0, + "completions/max_terminated_length": 16137.0, + "completions/mean_length": 6066.6015625, + "completions/mean_terminated_length": 6066.6015625, + "completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "entropy": 0.8450648710131645, + "epoch": 0.46642134314627415, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004621773958206177, + "learning_rate": 1e-05, + "loss": 0.121, + "num_tokens": 446464587.0, + "reward": 0.5390625, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000154972076416, + 
"sampling/importance_sampling_ratio/min": 1.3950601896794979e-05, + "sampling/sampling_logp_difference/max": 11.179987907409668, + "sampling/sampling_logp_difference/mean": 0.018016980960965157, + "step": 507 + }, + { + "clip_ratio/high_max": 3.0534724828612525e-06, + "clip_ratio/high_mean": 7.633681207153131e-07, + "clip_ratio/low_mean": 2.149350007130124e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2256868305703392e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 6988.0234375, + "completions/mean_terminated_length": 6838.88134765625, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 1.0452716201543808, + "epoch": 0.46734130634774607, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004523546434938908, + "learning_rate": 1e-05, + "loss": 0.0396, + "num_tokens": 447381134.0, + "reward": 0.3515625, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999901056289673, + "sampling/importance_sampling_ratio/min": 0.016167031601071358, + "sampling/sampling_logp_difference/max": 4.124781131744385, + "sampling/sampling_logp_difference/mean": 0.021812722086906433, + "step": 508 + }, + { + "clip_ratio/high_max": 5.58759120394825e-06, + "clip_ratio/high_mean": 1.3968978009870625e-06, + "clip_ratio/low_mean": 3.684896307731833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.824586099199223e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12316.0, + "completions/max_terminated_length": 12316.0, + "completions/mean_length": 5948.5, + "completions/mean_terminated_length": 5948.5, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8241566568613052, + 
"epoch": 0.46826126954921804, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004002885892987251, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 448158014.0, + "reward": 0.5703125, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 0.0008566387114115059, + "sampling/sampling_logp_difference/max": 7.062494277954102, + "sampling/sampling_logp_difference/mean": 0.018487900495529175, + "step": 509 + }, + { + "clip_ratio/high_max": 1.0490723752809572e-05, + "clip_ratio/high_mean": 3.439610338773491e-06, + "clip_ratio/low_mean": 3.973086239739132e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3170473020381905e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16044.0, + "completions/mean_length": 7966.375, + "completions/mean_terminated_length": 7764.3525390625, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.8868448063731194, + "epoch": 0.46918123275068996, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019062751671299338, + "learning_rate": 1e-05, + "loss": 0.0787, + "num_tokens": 449197054.0, + "reward": 0.40625, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0001614262000657618, + "sampling/sampling_logp_difference/max": 8.731462478637695, + "sampling/sampling_logp_difference/mean": 0.020015282556414604, + "step": 510 + }, + { + "clip_ratio/high_max": 1.2195105682621943e-05, + "clip_ratio/high_mean": 3.0487764206554857e-06, + 
"clip_ratio/low_mean": 3.558348203114292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8632259474979946e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 6520.0234375, + "completions/mean_terminated_length": 6442.3544921875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9168323278427124, + "epoch": 0.47010119595216193, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00490277074277401, + "learning_rate": 1e-05, + "loss": 0.0547, + "num_tokens": 450050153.0, + "reward": 0.484375, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 4.4418397919798736e-06, + "sampling/sampling_logp_difference/max": 12.324441909790039, + "sampling/sampling_logp_difference/mean": 0.020178331062197685, + "step": 511 + }, + { + "clip_ratio/high_max": 7.95772848505294e-06, + "clip_ratio/high_mean": 1.989432121263235e-06, + "clip_ratio/low_mean": 3.363800146871654e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.562743381735345e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 6614.5625, + "completions/mean_terminated_length": 6217.4306640625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.8635925352573395, + "epoch": 0.47102115915363385, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003792276605963707, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 450915281.0, + "reward": 0.5, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 
0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999154806137085, + "sampling/importance_sampling_ratio/min": 0.004489119164645672, + "sampling/sampling_logp_difference/max": 5.40609884262085, + "sampling/sampling_logp_difference/mean": 0.019233014434576035, + "step": 512 + }, + { + "clip_ratio/high_max": 1.6306271390931215e-05, + "clip_ratio/high_mean": 6.67555605105008e-06, + "clip_ratio/low_mean": 3.4846169796765025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1521726302562456e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 6458.5078125, + "completions/mean_terminated_length": 5970.36865234375, + "completions/min_length": 1025.0, + "completions/min_terminated_length": 1025.0, + "entropy": 0.8816124573349953, + "epoch": 0.47194112235510577, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031763892620801926, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 451761322.0, + "reward": 0.4921875, + "reward_std": 0.282474160194397, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999036192893982, + "sampling/importance_sampling_ratio/min": 9.611394489184022e-05, + "sampling/sampling_logp_difference/max": 9.24997615814209, + "sampling/sampling_logp_difference/mean": 0.01935420371592045, + "step": 513 + }, + { + "clip_ratio/high_max": 7.861634912842419e-06, + "clip_ratio/high_mean": 3.0314158721012063e-06, + "clip_ratio/low_mean": 2.2518463538290234e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.554987941039144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + "completions/mean_length": 5844.03125, + 
"completions/mean_terminated_length": 5676.73046875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.9008020162582397, + "epoch": 0.47286108555657774, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004134794697165489, + "learning_rate": 1e-05, + "loss": 0.1094, + "num_tokens": 452526342.0, + "reward": 0.546875, + "reward_std": 0.28930899500846863, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999297857284546, + "sampling/importance_sampling_ratio/min": 0.00012955136480741203, + "sampling/sampling_logp_difference/max": 8.951433181762695, + "sampling/sampling_logp_difference/mean": 0.02013866975903511, + "step": 514 + }, + { + "clip_ratio/high_max": 1.2711160707112867e-05, + "clip_ratio/high_mean": 3.177790176778217e-06, + "clip_ratio/low_mean": 2.444096298859222e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.761875293799676e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 6214.5859375, + "completions/mean_terminated_length": 6134.51171875, + "completions/min_length": 1096.0, + "completions/min_terminated_length": 1096.0, + "entropy": 0.9522949978709221, + "epoch": 0.47378104875804966, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022520655766129494, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 453343385.0, + "reward": 0.4921875, + "reward_std": 0.20623260736465454, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999879598617554, + "sampling/importance_sampling_ratio/min": 3.763851054827683e-05, + "sampling/sampling_logp_difference/max": 10.187482833862305, + 
"sampling/sampling_logp_difference/mean": 0.019947605207562447, + "step": 515 + }, + { + "clip_ratio/high_max": 5.724247012039996e-05, + "clip_ratio/high_mean": 1.431061753009999e-05, + "clip_ratio/low_mean": 3.371703428456385e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8027652155724354e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14376.0, + "completions/mean_length": 7138.515625, + "completions/mean_terminated_length": 7065.71630859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.8856206461787224, + "epoch": 0.47470101195952163, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004887089133262634, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 454275379.0, + "reward": 0.4609375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999544620513916, + "sampling/importance_sampling_ratio/min": 0.004931141622364521, + "sampling/sampling_logp_difference/max": 5.312184810638428, + "sampling/sampling_logp_difference/mean": 0.019449077546596527, + "step": 516 + }, + { + "clip_ratio/high_max": 1.5607688055752078e-05, + "clip_ratio/high_mean": 3.9019220139380195e-06, + "clip_ratio/low_mean": 4.936055870530254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.326248106030107e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15855.0, + "completions/mean_length": 6077.796875, + "completions/mean_terminated_length": 5915.00830078125, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.862022191286087, + "epoch": 0.47562097516099355, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003875041613355279, + 
"learning_rate": 1e-05, + "loss": 0.0366, + "num_tokens": 455076625.0, + "reward": 0.4921875, + "reward_std": 0.23933593928813934, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000392198562622, + "sampling/importance_sampling_ratio/min": 3.322543852846138e-05, + "sampling/sampling_logp_difference/max": 10.31219482421875, + "sampling/sampling_logp_difference/mean": 0.018907926976680756, + "step": 517 + }, + { + "clip_ratio/high_max": 1.0557040241110371e-05, + "clip_ratio/high_mean": 3.535163386914064e-06, + "clip_ratio/low_mean": 3.7409978290270374e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0945141790871276e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 6211.65625, + "completions/mean_terminated_length": 6211.65625, + "completions/min_length": 1292.0, + "completions/min_terminated_length": 1292.0, + "entropy": 0.8835236355662346, + "epoch": 0.4765409383624655, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004288897849619389, + "learning_rate": 1e-05, + "loss": 0.0822, + "num_tokens": 455889693.0, + "reward": 0.53125, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999270439147949, + "sampling/importance_sampling_ratio/min": 2.5614745027269237e-06, + "sampling/sampling_logp_difference/max": 12.874927520751953, + "sampling/sampling_logp_difference/mean": 0.01986120268702507, + "step": 518 + }, + { + "clip_ratio/high_max": 2.842265530489385e-06, + "clip_ratio/high_mean": 7.105663826223463e-07, + "clip_ratio/low_mean": 3.578249538804812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.649306199804414e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 7035.609375, + "completions/mean_terminated_length": 6962.0, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "entropy": 0.9033957049250603, + "epoch": 0.47746090156393745, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004230308346450329, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 456809643.0, + "reward": 0.3203125, + "reward_std": 0.17282497882843018, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999722242355347, + "sampling/importance_sampling_ratio/min": 1.670435995038133e-05, + "sampling/sampling_logp_difference/max": 10.99984073638916, + "sampling/sampling_logp_difference/mean": 0.020262110978364944, + "step": 519 + }, + { + "clip_ratio/high_max": 3.539844283295679e-05, + "clip_ratio/high_mean": 9.844010264714598e-06, + "clip_ratio/low_mean": 2.8534720058814855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.837873060774655e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16241.0, + "completions/mean_length": 6557.40625, + "completions/mean_terminated_length": 6321.568359375, + "completions/min_length": 1136.0, + "completions/min_terminated_length": 1136.0, + "entropy": 0.8352414071559906, + "epoch": 0.47838086476540936, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029154124204069376, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 457669431.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 5.8480534789850935e-05, + "sampling/sampling_logp_difference/max": 9.746816635131836, + "sampling/sampling_logp_difference/mean": 0.019474683329463005, + "step": 520 + }, + { + "clip_ratio/high_max": 6.400114170901361e-05, + "clip_ratio/high_mean": 1.917558859076962e-05, + "clip_ratio/low_mean": 5.166920755073079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.084479466357152e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15428.0, + "completions/mean_length": 6444.1328125, + "completions/mean_terminated_length": 6205.576171875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "entropy": 0.7480100840330124, + "epoch": 0.47930082796688134, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025195449125021696, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 458512648.0, + "reward": 0.515625, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999996542930603, + "sampling/importance_sampling_ratio/min": 2.4302940801135264e-05, + "sampling/sampling_logp_difference/max": 10.624913215637207, + "sampling/sampling_logp_difference/mean": 0.01779567077755928, + "step": 521 + }, + { + "clip_ratio/high_max": 2.748944325503544e-06, + "clip_ratio/high_mean": 6.87236081375886e-07, + "clip_ratio/low_mean": 3.4855478702411347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5542715181691165e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15868.0, + "completions/mean_length": 6615.234375, + "completions/mean_terminated_length": 6380.7841796875, + "completions/min_length": 105.0, + 
"completions/min_terminated_length": 105.0, + "entropy": 0.8428665772080421, + "epoch": 0.48022079116835326, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004339073318988085, + "learning_rate": 1e-05, + "loss": 0.0608, + "num_tokens": 459377790.0, + "reward": 0.5234375, + "reward_std": 0.31064465641975403, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999370574951172, + "sampling/importance_sampling_ratio/min": 0.00042492515058256686, + "sampling/sampling_logp_difference/max": 7.76359748840332, + "sampling/sampling_logp_difference/mean": 0.018815383315086365, + "step": 522 + }, + { + "clip_ratio/high_max": 2.2513844896820956e-05, + "clip_ratio/high_mean": 7.496596083456097e-06, + "clip_ratio/low_mean": 2.2591082483813807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0087678169365972e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15239.0, + "completions/mean_length": 6200.3203125, + "completions/mean_terminated_length": 5955.912109375, + "completions/min_length": 1032.0, + "completions/min_terminated_length": 1032.0, + "entropy": 0.9044734612107277, + "epoch": 0.48114075436982523, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005003004334867001, + "learning_rate": 1e-05, + "loss": 0.0502, + "num_tokens": 460189823.0, + "reward": 0.484375, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999645948410034, + "sampling/importance_sampling_ratio/min": 0.005019097588956356, + "sampling/sampling_logp_difference/max": 5.2945051193237305, + "sampling/sampling_logp_difference/mean": 0.0192951001226902, + "step": 523 + }, + { + "clip_ratio/high_max": 
1.9086801785306307e-05, + "clip_ratio/high_mean": 4.771700446326577e-06, + "clip_ratio/low_mean": 3.145246773783583e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.622416772941506e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15706.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 5758.9140625, + "completions/mean_terminated_length": 5758.9140625, + "completions/min_length": 1181.0, + "completions/min_terminated_length": 1181.0, + "entropy": 0.8783154934644699, + "epoch": 0.48206071757129715, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005491400603204966, + "learning_rate": 1e-05, + "loss": 0.0209, + "num_tokens": 460944164.0, + "reward": 0.5859375, + "reward_std": 0.2330428510904312, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.003907227888703346, + "sampling/sampling_logp_difference/max": 5.54492712020874, + "sampling/sampling_logp_difference/mean": 0.019315458834171295, + "step": 524 + }, + { + "clip_ratio/high_max": 1.5554858691757545e-05, + "clip_ratio/high_mean": 3.888714672939386e-06, + "clip_ratio/low_mean": 9.616303373150004e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3505018273463065e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15536.0, + "completions/mean_length": 7573.375, + "completions/mean_terminated_length": 7504.0, + "completions/min_length": 1579.0, + "completions/min_terminated_length": 1579.0, + "entropy": 1.057753436267376, + "epoch": 0.48298068077276907, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0038622859865427017, + "learning_rate": 1e-05, + "loss": 0.0103, + "num_tokens": 461931916.0, + "reward": 0.3125, + "reward_std": 0.14123955368995667, + 
"rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.002133321948349476, + "sampling/sampling_logp_difference/max": 6.1500749588012695, + "sampling/sampling_logp_difference/mean": 0.02145528793334961, + "step": 525 + }, + { + "clip_ratio/high_max": 2.2185531634022482e-05, + "clip_ratio/high_mean": 6.324094329102081e-06, + "clip_ratio/low_mean": 4.7102344296945375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342643908079481e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14553.0, + "completions/mean_length": 7353.0703125, + "completions/mean_terminated_length": 7136.328125, + "completions/min_length": 907.0, + "completions/min_terminated_length": 907.0, + "entropy": 0.9386680871248245, + "epoch": 0.48390064397424104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002902502194046974, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 462894701.0, + "reward": 0.5234375, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999150037765503, + "sampling/importance_sampling_ratio/min": 0.00492977537214756, + "sampling/sampling_logp_difference/max": 5.312461853027344, + "sampling/sampling_logp_difference/mean": 0.021296534687280655, + "step": 526 + }, + { + "clip_ratio/high_max": 1.8664793969946913e-05, + "clip_ratio/high_mean": 4.666198492486728e-06, + "clip_ratio/low_mean": 5.111583186589996e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.578203035838669e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15851.0, + "completions/mean_length": 7280.953125, + "completions/mean_terminated_length": 6987.30615234375, + "completions/min_length": 1111.0, + "completions/min_terminated_length": 1111.0, + "entropy": 0.9424067437648773, + "epoch": 0.48482060717571296, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002602500608190894, + "learning_rate": 1e-05, + "loss": 0.0546, + "num_tokens": 463849087.0, + "reward": 0.3125, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999302625656128, + "sampling/importance_sampling_ratio/min": 4.007156167062931e-05, + "sampling/sampling_logp_difference/max": 10.12484359741211, + "sampling/sampling_logp_difference/mean": 0.020630592480301857, + "step": 527 + }, + { + "clip_ratio/high_max": 3.77411461158772e-05, + "clip_ratio/high_mean": 1.0150766001970624e-05, + "clip_ratio/low_mean": 4.5688502041230095e-05, + "clip_ratio/low_min": 5.72383623875794e-06, + "clip_ratio/region_mean": 5.583926849794807e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14628.0, + "completions/max_terminated_length": 14628.0, + "completions/mean_length": 6520.6328125, + "completions/mean_terminated_length": 6520.6328125, + "completions/min_length": 1459.0, + "completions/min_terminated_length": 1459.0, + "entropy": 0.8501213267445564, + "epoch": 0.48574057037718493, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005743890535086393, + "learning_rate": 1e-05, + "loss": 0.1494, + "num_tokens": 464704336.0, + "reward": 0.3984375, + "reward_std": 0.3413938879966736, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999988079071045, + "sampling/importance_sampling_ratio/min": 5.838880315423012e-05, + 
"sampling/sampling_logp_difference/max": 9.74838638305664, + "sampling/sampling_logp_difference/mean": 0.018370801582932472, + "step": 528 + }, + { + "clip_ratio/high_max": 9.150254300038796e-06, + "clip_ratio/high_mean": 2.287563575009699e-06, + "clip_ratio/low_mean": 2.1804387529300584e-05, + "clip_ratio/low_min": 3.918126822100021e-06, + "clip_ratio/region_mean": 2.4091951559057634e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14675.0, + "completions/max_terminated_length": 14675.0, + "completions/mean_length": 7111.0, + "completions/mean_terminated_length": 7111.0, + "completions/min_length": 1288.0, + "completions/min_terminated_length": 1288.0, + "entropy": 0.8829544633626938, + "epoch": 0.48666053357865685, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004826955031603575, + "learning_rate": 1e-05, + "loss": 0.0967, + "num_tokens": 465632152.0, + "reward": 0.3984375, + "reward_std": 0.2975040376186371, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999524354934692, + "sampling/importance_sampling_ratio/min": 0.00011604782775975764, + "sampling/sampling_logp_difference/max": 9.061508178710938, + "sampling/sampling_logp_difference/mean": 0.019976403564214706, + "step": 529 + }, + { + "clip_ratio/high_max": 2.3185014015325578e-05, + "clip_ratio/high_mean": 7.603994390592561e-06, + "clip_ratio/low_mean": 4.392900382299558e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.153299889570917e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15132.0, + "completions/mean_length": 7797.7109375, + "completions/mean_terminated_length": 7448.67431640625, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.9747610911726952, + "epoch": 0.48758049678012877, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0028944616205990314, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 466648507.0, + "reward": 0.390625, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999991774559021, + "sampling/importance_sampling_ratio/min": 0.0002612585376482457, + "sampling/sampling_logp_difference/max": 8.25, + "sampling/sampling_logp_difference/mean": 0.020830729976296425, + "step": 530 + }, + { + "clip_ratio/high_max": 1.4947459476388758e-05, + "clip_ratio/high_mean": 3.7368648690971895e-06, + "clip_ratio/low_mean": 4.282657914700394e-05, + "clip_ratio/low_min": 4.545454430626705e-06, + "clip_ratio/region_mean": 4.656344435716164e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 6395.4765625, + "completions/mean_terminated_length": 6316.82666015625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.9015842452645302, + "epoch": 0.48850045998160074, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003612271510064602, + "learning_rate": 1e-05, + "loss": 0.0573, + "num_tokens": 467487976.0, + "reward": 0.4921875, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 1.209868287332938e-06, + "sampling/sampling_logp_difference/max": 13.624999046325684, + "sampling/sampling_logp_difference/mean": 0.01959329843521118, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.8946868863167765e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8946868863167765e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15694.0, + "completions/mean_length": 7298.78125, + "completions/mean_terminated_length": 7154.57177734375, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9978953301906586, + "epoch": 0.48942042318307266, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002104024635627866, + "learning_rate": 1e-05, + "loss": 0.0104, + "num_tokens": 468445132.0, + "reward": 0.2890625, + "reward_std": 0.2301519513130188, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999783039093018, + "sampling/importance_sampling_ratio/min": 5.157754640094936e-05, + "sampling/sampling_logp_difference/max": 9.872424125671387, + "sampling/sampling_logp_difference/mean": 0.021517785266041756, + "step": 532 + }, + { + "clip_ratio/high_max": 2.0034196040796814e-05, + "clip_ratio/high_mean": 6.441706659643387e-06, + "clip_ratio/low_mean": 3.0451521752183908e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.689322829814046e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16003.0, + "completions/mean_length": 7021.53125, + "completions/mean_terminated_length": 6561.08154296875, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.9539581760764122, + "epoch": 0.49034038638454464, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0009346248698420823, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 469360760.0, + "reward": 0.375, + "reward_std": 0.20069600641727448, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.0029978419188410044, + "sampling/sampling_logp_difference/max": 5.8098626136779785, + "sampling/sampling_logp_difference/mean": 0.020538944751024246, + "step": 533 + }, + { + "clip_ratio/high_max": 7.874939228713629e-06, + "clip_ratio/high_mean": 1.968734807178407e-06, + "clip_ratio/low_mean": 3.2224923302237585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.419365827994625e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15370.0, + "completions/max_terminated_length": 15370.0, + "completions/mean_length": 6988.2109375, + "completions/mean_terminated_length": 6988.2109375, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "entropy": 0.9471191540360451, + "epoch": 0.49126034958601655, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002331435214728117, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 470274859.0, + "reward": 0.3203125, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002145767212, + "sampling/importance_sampling_ratio/min": 0.0015642779180780053, + "sampling/sampling_logp_difference/max": 6.460330963134766, + "sampling/sampling_logp_difference/mean": 0.02088295854628086, + "step": 534 + }, + { + "clip_ratio/high_max": 1.2364610256554442e-05, + "clip_ratio/high_mean": 3.0911525641386106e-06, + "clip_ratio/low_mean": 3.8229277151913266e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.132042954552162e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16212.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 7557.453125, + "completions/mean_terminated_length": 7557.453125, 
+ "completions/min_length": 1064.0, + "completions/min_terminated_length": 1064.0, + "entropy": 0.9897207245230675, + "epoch": 0.4921803127874885, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004562230780720711, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 471263997.0, + "reward": 0.4765625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 0.0001586318830959499, + "sampling/sampling_logp_difference/max": 8.748924255371094, + "sampling/sampling_logp_difference/mean": 0.02160259149968624, + "step": 535 + }, + { + "clip_ratio/high_max": 2.6050724500237266e-05, + "clip_ratio/high_mean": 7.420082738462952e-06, + "clip_ratio/low_mean": 5.8747830053107464e-05, + "clip_ratio/low_min": 1.3906133062846493e-05, + "clip_ratio/region_mean": 6.616791324631777e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15603.0, + "completions/mean_length": 6532.1953125, + "completions/mean_terminated_length": 6295.75244140625, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.9109068289399147, + "epoch": 0.49310027598896045, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004525062162429094, + "learning_rate": 1e-05, + "loss": 0.0219, + "num_tokens": 472120622.0, + "reward": 0.4296875, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999650120735168, + "sampling/importance_sampling_ratio/min": 1.474883083574241e-05, + "sampling/sampling_logp_difference/max": 11.124346733093262, + "sampling/sampling_logp_difference/mean": 
0.019527796655893326, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.90738064766083e-05, + "clip_ratio/low_min": 1.0626089533616323e-05, + "clip_ratio/region_mean": 3.90738064766083e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15011.0, + "completions/mean_length": 5994.40625, + "completions/mean_terminated_length": 5912.5986328125, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "entropy": 0.9276224821805954, + "epoch": 0.49402023919043236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005058468785136938, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 472906346.0, + "reward": 0.421875, + "reward_std": 0.19044627249240875, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 0.0005196271813474596, + "sampling/sampling_logp_difference/max": 7.562398910522461, + "sampling/sampling_logp_difference/mean": 0.020568232983350754, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.992188062009518e-05, + "clip_ratio/low_min": 1.2131874427723233e-05, + "clip_ratio/region_mean": 5.992188062009518e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15992.0, + "completions/mean_length": 6469.046875, + "completions/mean_terminated_length": 6311.6669921875, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.9536962807178497, + "epoch": 0.49494020239190434, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007286665495485067, + "learning_rate": 1e-05, + "loss": 0.1282, + "num_tokens": 473756256.0, + "reward": 0.3515625, + 
"reward_std": 0.35772189497947693, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000038146972656, + "sampling/importance_sampling_ratio/min": 6.244324322324246e-05, + "sampling/sampling_logp_difference/max": 9.681252479553223, + "sampling/sampling_logp_difference/mean": 0.019624462351202965, + "step": 538 + }, + { + "clip_ratio/high_max": 1.0018506145570427e-05, + "clip_ratio/high_mean": 2.504626536392607e-06, + "clip_ratio/low_mean": 3.329443018174061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.57990563770727e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15383.0, + "completions/max_terminated_length": 15383.0, + "completions/mean_length": 5778.703125, + "completions/mean_terminated_length": 5778.703125, + "completions/min_length": 903.0, + "completions/min_terminated_length": 903.0, + "entropy": 0.9274095296859741, + "epoch": 0.49586016559337626, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031439310405403376, + "learning_rate": 1e-05, + "loss": -0.0091, + "num_tokens": 474515194.0, + "reward": 0.3828125, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000576972961426, + "sampling/importance_sampling_ratio/min": 0.0006267410353757441, + "sampling/sampling_logp_difference/max": 7.374977111816406, + "sampling/sampling_logp_difference/mean": 0.019796252250671387, + "step": 539 + }, + { + "clip_ratio/high_max": 3.1761268928676145e-05, + "clip_ratio/high_mean": 9.23904565297562e-06, + "clip_ratio/low_mean": 4.140612338687788e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.064516949460085e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16146.0, + 
"completions/max_terminated_length": 16146.0, + "completions/mean_length": 6400.75, + "completions/mean_terminated_length": 6400.75, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 0.8927748426795006, + "epoch": 0.49678012879484823, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0039032045751810074, + "learning_rate": 1e-05, + "loss": 0.0938, + "num_tokens": 475355186.0, + "reward": 0.5546875, + "reward_std": 0.3135277032852173, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880194664001, + "sampling/importance_sampling_ratio/min": 4.19893694925122e-06, + "sampling/sampling_logp_difference/max": 12.3806791305542, + "sampling/sampling_logp_difference/mean": 0.019878748804330826, + "step": 540 + }, + { + "clip_ratio/high_max": 2.524126966818585e-05, + "clip_ratio/high_mean": 7.227385253827379e-06, + "clip_ratio/low_mean": 5.609390495919797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.332129100883321e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14347.0, + "completions/mean_length": 7150.234375, + "completions/mean_terminated_length": 6928.62451171875, + "completions/min_length": 1548.0, + "completions/min_terminated_length": 1548.0, + "entropy": 0.8632503524422646, + "epoch": 0.49770009199632015, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004979084711521864, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 476289752.0, + "reward": 0.4765625, + "reward_std": 0.3369181156158447, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991059303284, + "sampling/importance_sampling_ratio/min": 0.0004304716712795198, + 
"sampling/sampling_logp_difference/max": 7.75062894821167, + "sampling/sampling_logp_difference/mean": 0.019658904522657394, + "step": 541 + }, + { + "clip_ratio/high_max": 2.5298505988757825e-05, + "clip_ratio/high_mean": 6.324626497189456e-06, + "clip_ratio/low_mean": 3.922748987861269e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.555211648948898e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 6855.6640625, + "completions/mean_terminated_length": 6704.4208984375, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.8328540697693825, + "epoch": 0.49862005519779207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003560611279681325, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 477186885.0, + "reward": 0.515625, + "reward_std": 0.2743411958217621, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998643398284912, + "sampling/importance_sampling_ratio/min": 0.00021035241661593318, + "sampling/sampling_logp_difference/max": 8.466726303100586, + "sampling/sampling_logp_difference/mean": 0.01880962960422039, + "step": 542 + }, + { + "clip_ratio/high_max": 8.90761498339998e-06, + "clip_ratio/high_mean": 2.226903745849995e-06, + "clip_ratio/low_mean": 5.487640487444878e-05, + "clip_ratio/low_min": 6.345177553157555e-06, + "clip_ratio/region_mean": 5.7103308108708006e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15880.0, + "completions/mean_length": 7117.1015625, + "completions/mean_terminated_length": 6818.1689453125, + "completions/min_length": 1067.0, + "completions/min_terminated_length": 1067.0, + "entropy": 0.9280833601951599, + "epoch": 0.49954001839926404, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037869063671678305, + "learning_rate": 1e-05, + "loss": 0.0773, + "num_tokens": 478121506.0, + "reward": 0.484375, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 3.256524507833092e-07, + "sampling/sampling_logp_difference/max": 14.937435150146484, + "sampling/sampling_logp_difference/mean": 0.0203043594956398, + "step": 543 + }, + { + "clip_ratio/high_max": 1.3482746680892888e-05, + "clip_ratio/high_mean": 3.370686670223222e-06, + "clip_ratio/low_mean": 3.976425330165512e-05, + "clip_ratio/low_min": 4.979286131856497e-06, + "clip_ratio/region_mean": 4.313493991503492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6885.7109375, + "completions/mean_terminated_length": 6734.94482421875, + "completions/min_length": 1184.0, + "completions/min_terminated_length": 1184.0, + "entropy": 0.9137701392173767, + "epoch": 0.500459981600736, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002787451259791851, + "learning_rate": 1e-05, + "loss": 0.0847, + "num_tokens": 479021365.0, + "reward": 0.5, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000042915344238, + "sampling/importance_sampling_ratio/min": 0.0013747947523370385, + "sampling/sampling_logp_difference/max": 6.589450836181641, + "sampling/sampling_logp_difference/mean": 0.02060278132557869, + "step": 544 + }, + { + "clip_ratio/high_max": 2.918380459959735e-05, + "clip_ratio/high_mean": 8.077826691987866e-06, + "clip_ratio/low_mean": 
4.93504342102824e-05, + "clip_ratio/low_min": 5.1258921303087845e-06, + "clip_ratio/region_mean": 5.742826124333078e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15047.0, + "completions/mean_length": 7055.7265625, + "completions/mean_terminated_length": 6982.275390625, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "entropy": 1.1009352952241898, + "epoch": 0.5013799448022079, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005555091425776482, + "learning_rate": 1e-05, + "loss": 0.0225, + "num_tokens": 479951778.0, + "reward": 0.28125, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 2.7657671353154e-07, + "sampling/sampling_logp_difference/max": 15.100777626037598, + "sampling/sampling_logp_difference/mean": 0.02176634594798088, + "step": 545 + }, + { + "clip_ratio/high_max": 9.75229158939328e-06, + "clip_ratio/high_mean": 2.43807289734832e-06, + "clip_ratio/low_mean": 3.58120408918694e-05, + "clip_ratio/low_min": 5.571651399804978e-06, + "clip_ratio/region_mean": 3.825011424396507e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16100.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 6088.2109375, + "completions/mean_terminated_length": 6088.2109375, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.7534168809652328, + "epoch": 0.5022999080036799, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.00568060576915741, + "learning_rate": 1e-05, + "loss": 0.1423, + "num_tokens": 480749677.0, + "reward": 0.6484375, + "reward_std": 0.3729842007160187, + "rewards/accuracy_reward/mean": 0.6484375, + 
"rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527931213379, + "sampling/importance_sampling_ratio/min": 0.0002166072663385421, + "sampling/sampling_logp_difference/max": 8.437424659729004, + "sampling/sampling_logp_difference/mean": 0.017093103379011154, + "step": 546 + }, + { + "clip_ratio/high_max": 1.821310434024781e-05, + "clip_ratio/high_mean": 4.5532760850619525e-06, + "clip_ratio/low_mean": 2.870424191314669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.325751754346129e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16029.0, + "completions/mean_length": 5638.8515625, + "completions/mean_terminated_length": 5380.96826171875, + "completions/min_length": 1352.0, + "completions/min_terminated_length": 1352.0, + "entropy": 0.8868100792169571, + "epoch": 0.5032198712051518, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019015485886484385, + "learning_rate": 1e-05, + "loss": 0.1025, + "num_tokens": 481489954.0, + "reward": 0.59375, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911904335022, + "sampling/importance_sampling_ratio/min": 0.0001796126161934808, + "sampling/sampling_logp_difference/max": 8.62470817565918, + "sampling/sampling_logp_difference/mean": 0.019102448597550392, + "step": 547 + }, + { + "clip_ratio/high_max": 2.3414544557454064e-05, + "clip_ratio/high_mean": 7.0229532411758555e-06, + "clip_ratio/low_mean": 3.169551814607985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8718471842003055e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15258.0, + "completions/mean_length": 
6776.59375, + "completions/mean_terminated_length": 6624.095703125, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.9075161814689636, + "epoch": 0.5041398344066237, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004203350283205509, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 482375358.0, + "reward": 0.453125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999104738235474, + "sampling/importance_sampling_ratio/min": 0.0036098493728786707, + "sampling/sampling_logp_difference/max": 5.6320695877075195, + "sampling/sampling_logp_difference/mean": 0.019327163696289062, + "step": 548 + }, + { + "clip_ratio/high_max": 1.8746226487564854e-05, + "clip_ratio/high_mean": 5.84939061809564e-06, + "clip_ratio/low_mean": 3.6077018648938974e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.192640903966094e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15684.0, + "completions/mean_length": 7507.59375, + "completions/mean_terminated_length": 7071.048828125, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "entropy": 0.8015655726194382, + "epoch": 0.5050597976080957, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004891456104815006, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 483357450.0, + "reward": 0.3359375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999200701713562, + "sampling/importance_sampling_ratio/min": 0.0032753932755440474, + "sampling/sampling_logp_difference/max": 5.721317291259766, + 
"sampling/sampling_logp_difference/mean": 0.019086822867393494, + "step": 549 + }, + { + "clip_ratio/high_max": 2.4045971031227964e-05, + "clip_ratio/high_mean": 6.011492757806991e-06, + "clip_ratio/low_mean": 3.096040018135682e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.697189299600723e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 6061.3125, + "completions/mean_terminated_length": 5813.568359375, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.8335569724440575, + "epoch": 0.5059797608095676, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003564947983250022, + "learning_rate": 1e-05, + "loss": 0.028, + "num_tokens": 484153554.0, + "reward": 0.3984375, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999876022338867, + "sampling/importance_sampling_ratio/min": 0.02006213553249836, + "sampling/sampling_logp_difference/max": 3.908921003341675, + "sampling/sampling_logp_difference/mean": 0.018360145390033722, + "step": 550 + }, + { + "clip_ratio/high_max": 9.095339009945747e-06, + "clip_ratio/high_mean": 2.2738347524864366e-06, + "clip_ratio/low_mean": 4.612986276697484e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.840369865632965e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 7312.4921875, + "completions/mean_terminated_length": 7241.06298828125, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.9900097697973251, + "epoch": 0.5068997240110396, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0032013265881687403, + 
"learning_rate": 1e-05, + "loss": 0.0976, + "num_tokens": 485111601.0, + "reward": 0.3125, + "reward_std": 0.21040895581245422, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999306201934814, + "sampling/importance_sampling_ratio/min": 0.006552733480930328, + "sampling/sampling_logp_difference/max": 5.0278730392456055, + "sampling/sampling_logp_difference/mean": 0.020712960511446, + "step": 551 + }, + { + "clip_ratio/high_max": 1.360053283860907e-05, + "clip_ratio/high_mean": 4.2937051603075815e-06, + "clip_ratio/low_mean": 4.3424448904261226e-05, + "clip_ratio/low_min": 4.718405762105249e-06, + "clip_ratio/region_mean": 4.771815429194248e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14797.0, + "completions/max_terminated_length": 14797.0, + "completions/mean_length": 6571.4453125, + "completions/mean_terminated_length": 6571.4453125, + "completions/min_length": 951.0, + "completions/min_terminated_length": 951.0, + "entropy": 0.8801060244441032, + "epoch": 0.5078196872125115, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002972986316308379, + "learning_rate": 1e-05, + "loss": 0.0888, + "num_tokens": 485971554.0, + "reward": 0.5234375, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998995065689087, + "sampling/importance_sampling_ratio/min": 2.4590379325672984e-05, + "sampling/sampling_logp_difference/max": 10.613155364990234, + "sampling/sampling_logp_difference/mean": 0.020055105909705162, + "step": 552 + }, + { + "clip_ratio/high_max": 8.231255606006016e-06, + "clip_ratio/high_mean": 2.057813901501504e-06, + "clip_ratio/low_mean": 3.511405452627514e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.71718685983069e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 6879.2890625, + "completions/mean_terminated_length": 6728.4208984375, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.8452998399734497, + "epoch": 0.5087396504139834, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00798189826309681, + "learning_rate": 1e-05, + "loss": 0.0278, + "num_tokens": 486873791.0, + "reward": 0.4609375, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493956565857, + "sampling/importance_sampling_ratio/min": 0.005210345610976219, + "sampling/sampling_logp_difference/max": 5.25710916519165, + "sampling/sampling_logp_difference/mean": 0.02010834403336048, + "step": 553 + }, + { + "clip_ratio/high_max": 1.757707786964602e-05, + "clip_ratio/high_mean": 4.394269467411505e-06, + "clip_ratio/low_mean": 6.0756912262149854e-05, + "clip_ratio/low_min": 1.0878021839744179e-05, + "clip_ratio/region_mean": 6.51511809337535e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16237.0, + "completions/max_terminated_length": 16237.0, + "completions/mean_length": 7169.8828125, + "completions/mean_terminated_length": 7169.8828125, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.9671438857913017, + "epoch": 0.5096596136154554, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038661460857838392, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 487814936.0, + "reward": 0.3359375, + "reward_std": 0.23751862347126007, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 4.6830271458020434e-05, + "sampling/sampling_logp_difference/max": 9.96898078918457, + "sampling/sampling_logp_difference/mean": 0.02097059041261673, + "step": 554 + }, + { + "clip_ratio/high_max": 4.649260063160909e-06, + "clip_ratio/high_mean": 1.1623150157902273e-06, + "clip_ratio/low_mean": 3.180719090778439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2969506037261453e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 6945.0390625, + "completions/mean_terminated_length": 6870.71630859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9309702143073082, + "epoch": 0.5105795768169273, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002214127918705344, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 488720293.0, + "reward": 0.375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914169311523, + "sampling/importance_sampling_ratio/min": 0.00032080389792099595, + "sampling/sampling_logp_difference/max": 8.04468059539795, + "sampling/sampling_logp_difference/mean": 0.01968962326645851, + "step": 555 + }, + { + "clip_ratio/high_max": 1.5428002825501608e-05, + "clip_ratio/high_mean": 3.857000706375402e-06, + "clip_ratio/low_mean": 5.9988536690980254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.384553716998198e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5970.1015625, + "completions/mean_terminated_length": 5804.8017578125, + "completions/min_length": 610.0, + 
"completions/min_terminated_length": 610.0, + "entropy": 0.8274230882525444, + "epoch": 0.5114995400183993, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026088031008839607, + "learning_rate": 1e-05, + "loss": 0.0919, + "num_tokens": 489504626.0, + "reward": 0.484375, + "reward_std": 0.3237725496292114, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999892711639404, + "sampling/importance_sampling_ratio/min": 0.00033548183273524046, + "sampling/sampling_logp_difference/max": 7.999942779541016, + "sampling/sampling_logp_difference/mean": 0.018132124096155167, + "step": 556 + }, + { + "clip_ratio/high_max": 1.628765676287003e-05, + "clip_ratio/high_mean": 5.032566036788921e-06, + "clip_ratio/low_mean": 3.257978141846252e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.761234722787776e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15636.0, + "completions/mean_length": 7099.578125, + "completions/mean_terminated_length": 6952.20654296875, + "completions/min_length": 567.0, + "completions/min_terminated_length": 567.0, + "entropy": 0.8690815567970276, + "epoch": 0.5124195032198712, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0040014018304646015, + "learning_rate": 1e-05, + "loss": 0.0021, + "num_tokens": 490431156.0, + "reward": 0.4609375, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368786811829, + "sampling/importance_sampling_ratio/min": 0.0007102031959220767, + "sampling/sampling_logp_difference/max": 7.249959468841553, + "sampling/sampling_logp_difference/mean": 0.02036934345960617, + "step": 557 + }, + { + "clip_ratio/high_max": 
1.3314914440343273e-05, + "clip_ratio/high_mean": 3.3287286100858182e-06, + "clip_ratio/low_mean": 3.747020150512981e-05, + "clip_ratio/low_min": 3.852436293527717e-06, + "clip_ratio/region_mean": 4.079892983099853e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7253.296875, + "completions/mean_terminated_length": 6725.07421875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8692722395062447, + "epoch": 0.5133394664213431, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002252641599625349, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 491378450.0, + "reward": 0.328125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999855756759644, + "sampling/importance_sampling_ratio/min": 1.893525586638134e-05, + "sampling/sampling_logp_difference/max": 10.87448501586914, + "sampling/sampling_logp_difference/mean": 0.01926814392209053, + "step": 558 + }, + { + "clip_ratio/high_max": 3.51339258486405e-05, + "clip_ratio/high_mean": 1.0567253070803417e-05, + "clip_ratio/low_mean": 3.905345306520758e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962070602232416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 7827.0234375, + "completions/mean_terminated_length": 7406.18798828125, + "completions/min_length": 808.0, + "completions/min_terminated_length": 808.0, + "entropy": 0.9718392416834831, + "epoch": 0.5142594296228151, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023995323572307825, + "learning_rate": 1e-05, + "loss": 0.0684, + "num_tokens": 492398757.0, + "reward": 0.3359375, + "reward_std": 
0.26826781034469604, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999961256980896, + "sampling/importance_sampling_ratio/min": 0.0003522284678183496, + "sampling/sampling_logp_difference/max": 7.951230525970459, + "sampling/sampling_logp_difference/mean": 0.020725054666399956, + "step": 559 + }, + { + "clip_ratio/high_max": 9.237001677320222e-06, + "clip_ratio/high_mean": 2.3092504193300556e-06, + "clip_ratio/low_mean": 4.477454979223694e-05, + "clip_ratio/low_min": 3.5987793580716243e-06, + "clip_ratio/region_mean": 4.708380049578409e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14833.0, + "completions/max_terminated_length": 14833.0, + "completions/mean_length": 6578.53125, + "completions/mean_terminated_length": 6578.53125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.9265799149870872, + "epoch": 0.515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0053934333845973015, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 493259049.0, + "reward": 0.4140625, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999976396560669, + "sampling/importance_sampling_ratio/min": 1.5993017541404697e-06, + "sampling/sampling_logp_difference/max": 13.345943450927734, + "sampling/sampling_logp_difference/mean": 0.019497254863381386, + "step": 560 + }, + { + "clip_ratio/high_max": 6.991247119003674e-06, + "clip_ratio/high_mean": 2.580789669082151e-06, + "clip_ratio/low_mean": 4.2538599473118666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.511938891482714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15783.0, + "completions/mean_length": 7893.7734375, + "completions/mean_terminated_length": 7826.92138671875, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "entropy": 0.9697273746132851, + "epoch": 0.516099356025759, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003773769596591592, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 494288028.0, + "reward": 0.296875, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000444650650024, + "sampling/importance_sampling_ratio/min": 4.6216489863581955e-05, + "sampling/sampling_logp_difference/max": 9.982173919677734, + "sampling/sampling_logp_difference/mean": 0.020743828266859055, + "step": 561 + }, + { + "clip_ratio/high_max": 1.060595786839258e-05, + "clip_ratio/high_mean": 4.29665919909894e-06, + "clip_ratio/low_mean": 3.2997783137034276e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729444244982005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15024.0, + "completions/mean_length": 6483.7734375, + "completions/mean_terminated_length": 6405.81884765625, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.8293593674898148, + "epoch": 0.5170193192272309, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.006334445904940367, + "learning_rate": 1e-05, + "loss": 0.0217, + "num_tokens": 495135903.0, + "reward": 0.5, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999064207077026, + "sampling/importance_sampling_ratio/min": 0.0001236602693097666, + 
"sampling/sampling_logp_difference/max": 8.99797248840332, + "sampling/sampling_logp_difference/mean": 0.018669776618480682, + "step": 562 + }, + { + "clip_ratio/high_max": 9.357276894661481e-06, + "clip_ratio/high_mean": 2.3393192236653704e-06, + "clip_ratio/low_mean": 4.667806888392079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.901738748230855e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16230.0, + "completions/mean_length": 6484.546875, + "completions/mean_terminated_length": 6246.96044921875, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.7686850279569626, + "epoch": 0.5179392824287029, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003286323742941022, + "learning_rate": 1e-05, + "loss": 0.0865, + "num_tokens": 495986277.0, + "reward": 0.59375, + "reward_std": 0.3763991594314575, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945342540741, + "sampling/importance_sampling_ratio/min": 2.0216441043885425e-05, + "sampling/sampling_logp_difference/max": 10.809014320373535, + "sampling/sampling_logp_difference/mean": 0.018656805157661438, + "step": 563 + }, + { + "clip_ratio/high_max": 3.368905208844808e-05, + "clip_ratio/high_mean": 9.76577109668142e-06, + "clip_ratio/low_mean": 8.26880966542376e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8034580989478854e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 6411.3203125, + "completions/mean_terminated_length": 5746.47509765625, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 0.899998240172863, + "epoch": 0.5188592456301748, + "frac_reward_zero_std": 0.5, 
+ "grad_norm": 0.005072349216789007, + "learning_rate": 1e-05, + "loss": -0.0049, + "num_tokens": 496826094.0, + "reward": 0.515625, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999135732650757, + "sampling/importance_sampling_ratio/min": 0.0038024066016077995, + "sampling/sampling_logp_difference/max": 5.5721211433410645, + "sampling/sampling_logp_difference/mean": 0.019648944959044456, + "step": 564 + }, + { + "clip_ratio/high_max": 1.726673963275971e-05, + "clip_ratio/high_mean": 6.2551004020861e-06, + "clip_ratio/low_mean": 4.834715275592316e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4602252930635586e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 7110.0546875, + "completions/mean_terminated_length": 6810.89501953125, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 1.0061073675751686, + "epoch": 0.5197792088316467, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005030680447816849, + "learning_rate": 1e-05, + "loss": 0.0871, + "num_tokens": 497756469.0, + "reward": 0.375, + "reward_std": 0.3253750801086426, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999985933303833, + "sampling/importance_sampling_ratio/min": 0.0004307488852646202, + "sampling/sampling_logp_difference/max": 7.749985218048096, + "sampling/sampling_logp_difference/mean": 0.02187274768948555, + "step": 565 + }, + { + "clip_ratio/high_max": 3.3920382520591374e-06, + "clip_ratio/high_mean": 8.480095630147844e-07, + "clip_ratio/low_mean": 2.627351494766117e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.712152416961544e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7546.484375, + "completions/mean_terminated_length": 7261.40283203125, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.898541085422039, + "epoch": 0.5206991720331187, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002894402015954256, + "learning_rate": 1e-05, + "loss": -0.0016, + "num_tokens": 498743411.0, + "reward": 0.25, + "reward_std": 0.2380426526069641, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998988509178162, + "sampling/importance_sampling_ratio/min": 3.340166585985571e-05, + "sampling/sampling_logp_difference/max": 10.306904792785645, + "sampling/sampling_logp_difference/mean": 0.019597206264734268, + "step": 566 + }, + { + "clip_ratio/high_max": 3.407480107853189e-06, + "clip_ratio/high_mean": 8.518700269632973e-07, + "clip_ratio/low_mean": 1.9815101950371172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.066697197733447e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15426.0, + "completions/mean_length": 6637.9296875, + "completions/mean_terminated_length": 6241.74755859375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "entropy": 0.9469815120100975, + "epoch": 0.5216191352345906, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033100086729973555, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 499612490.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999792575836182, + "sampling/importance_sampling_ratio/min": 0.000214192972634919, + "sampling/sampling_logp_difference/max": 8.448633193969727, + "sampling/sampling_logp_difference/mean": 0.019627269357442856, + "step": 567 + }, + { + "clip_ratio/high_max": 2.8962323767700582e-05, + "clip_ratio/high_mean": 7.2405809419251455e-06, + "clip_ratio/low_mean": 6.551078422489809e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.275136522366665e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15136.0, + "completions/mean_length": 6903.0859375, + "completions/mean_terminated_length": 6752.595703125, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.976447619497776, + "epoch": 0.5225390984360626, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006571728736162186, + "learning_rate": 1e-05, + "loss": 0.0543, + "num_tokens": 500515117.0, + "reward": 0.40625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.016446342691779137, + "sampling/sampling_logp_difference/max": 4.107652187347412, + "sampling/sampling_logp_difference/mean": 0.020653847604990005, + "step": 568 + }, + { + "clip_ratio/high_max": 1.4576415196643211e-05, + "clip_ratio/high_mean": 3.6441037991608027e-06, + "clip_ratio/low_mean": 7.513643731726916e-05, + "clip_ratio/low_min": 2.2551557776750997e-05, + "clip_ratio/region_mean": 7.878054020693526e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15556.0, + "completions/mean_length": 6953.8359375, + "completions/mean_terminated_length": 6570.49560546875, + 
"completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.8397975340485573, + "epoch": 0.5234590616375345, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007468517404049635, + "learning_rate": 1e-05, + "loss": 0.0618, + "num_tokens": 501427056.0, + "reward": 0.421875, + "reward_std": 0.3571978807449341, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.0001911464933073148, + "sampling/sampling_logp_difference/max": 8.562470436096191, + "sampling/sampling_logp_difference/mean": 0.01937997341156006, + "step": 569 + }, + { + "clip_ratio/high_max": 3.168922489749093e-05, + "clip_ratio/high_mean": 7.922306224372733e-06, + "clip_ratio/low_mean": 3.7468206755875144e-05, + "clip_ratio/low_min": 5.264044375508092e-06, + "clip_ratio/region_mean": 4.5390514060272835e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15961.0, + "completions/mean_length": 7807.09375, + "completions/mean_terminated_length": 7458.43896484375, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "entropy": 0.7974586114287376, + "epoch": 0.5243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004324767272919416, + "learning_rate": 1e-05, + "loss": 0.0431, + "num_tokens": 502445156.0, + "reward": 0.265625, + "reward_std": 0.3329663574695587, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999243021011353, + "sampling/importance_sampling_ratio/min": 2.9874459869461134e-05, + "sampling/sampling_logp_difference/max": 10.418506622314453, + "sampling/sampling_logp_difference/mean": 0.018592730164527893, + 
"step": 570 + }, + { + "clip_ratio/high_max": 1.8414293663227e-05, + "clip_ratio/high_mean": 5.567038670051261e-06, + "clip_ratio/low_mean": 3.436269958001503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9929738250066293e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 6467.890625, + "completions/mean_terminated_length": 6310.4921875, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "entropy": 0.8665193468332291, + "epoch": 0.5252989880404784, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0044867550022900105, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 503293398.0, + "reward": 0.4609375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.024881144985556602, + "sampling/sampling_logp_difference/max": 3.6936450004577637, + "sampling/sampling_logp_difference/mean": 0.019022464752197266, + "step": 571 + }, + { + "clip_ratio/high_max": 1.4845849818811985e-05, + "clip_ratio/high_mean": 3.711462454702996e-06, + "clip_ratio/low_mean": 3.597185968828853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.968332202930469e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16309.0, + "completions/mean_length": 6275.796875, + "completions/mean_terminated_length": 6115.349609375, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 0.8425783589482307, + "epoch": 0.5262189512419503, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033805551938712597, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 504115692.0, + "reward": 
0.3984375, + "reward_std": 0.2569621503353119, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000152587890625, + "sampling/importance_sampling_ratio/min": 0.018389537930488586, + "sampling/sampling_logp_difference/max": 3.9959733486175537, + "sampling/sampling_logp_difference/mean": 0.018935590982437134, + "step": 572 + }, + { + "clip_ratio/high_max": 4.3129479763592826e-05, + "clip_ratio/high_mean": 1.3471904480866215e-05, + "clip_ratio/low_mean": 1.670091853611666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0172822903296037e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16116.0, + "completions/mean_length": 5396.7890625, + "completions/mean_terminated_length": 5222.38916015625, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.8558806329965591, + "epoch": 0.5271389144434223, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00652205478399992, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 504826577.0, + "reward": 0.546875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.0017056812066584826, + "sampling/sampling_logp_difference/max": 6.373790740966797, + "sampling/sampling_logp_difference/mean": 0.018737314268946648, + "step": 573 + }, + { + "clip_ratio/high_max": 6.914692676218692e-06, + "clip_ratio/high_mean": 1.728673169054673e-06, + "clip_ratio/low_mean": 2.3435458388121333e-05, + "clip_ratio/low_min": 3.954319709009724e-06, + "clip_ratio/region_mean": 2.5164132239297032e-05, + "completions/clipped_ratio": 0.0859375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16298.0, + "completions/mean_length": 7798.9765625, + "completions/mean_terminated_length": 6991.837890625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.8846152648329735, + "epoch": 0.5280588776448942, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018958896398544312, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 505846438.0, + "reward": 0.328125, + "reward_std": 0.21253062784671783, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999515414237976, + "sampling/importance_sampling_ratio/min": 2.434831731079612e-05, + "sampling/sampling_logp_difference/max": 10.623047828674316, + "sampling/sampling_logp_difference/mean": 0.019361287355422974, + "step": 574 + }, + { + "clip_ratio/high_max": 1.085428675651201e-05, + "clip_ratio/high_mean": 5.064732249593362e-06, + "clip_ratio/low_mean": 5.590463968019321e-05, + "clip_ratio/low_min": 4.822531082027126e-06, + "clip_ratio/region_mean": 6.096937283928128e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16280.0, + "completions/mean_length": 6272.5546875, + "completions/mean_terminated_length": 6029.88037109375, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "entropy": 0.9714803844690323, + "epoch": 0.5289788408463661, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003035407979041338, + "learning_rate": 1e-05, + "loss": 0.1295, + "num_tokens": 506670477.0, + "reward": 0.3984375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212026596069, + 
"sampling/importance_sampling_ratio/min": 0.0012103202752768993, + "sampling/sampling_logp_difference/max": 6.716870307922363, + "sampling/sampling_logp_difference/mean": 0.019988738000392914, + "step": 575 + }, + { + "clip_ratio/high_max": 2.1176599602767965e-05, + "clip_ratio/high_mean": 5.294149900691991e-06, + "clip_ratio/low_mean": 4.479086726405512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.008501784686814e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6060.75, + "completions/mean_terminated_length": 5896.88916015625, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 0.8791732639074326, + "epoch": 0.5298988040478381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005080445669591427, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 507471717.0, + "reward": 0.421875, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999859929084778, + "sampling/importance_sampling_ratio/min": 0.0025768836494535208, + "sampling/sampling_logp_difference/max": 5.961174488067627, + "sampling/sampling_logp_difference/mean": 0.019146449863910675, + "step": 576 + }, + { + "clip_ratio/high_max": 1.591328441463702e-05, + "clip_ratio/high_mean": 3.978321103659255e-06, + "clip_ratio/low_mean": 3.991827338722942e-05, + "clip_ratio/low_min": 4.394445568323135e-06, + "clip_ratio/region_mean": 4.389659511616628e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7221.65625, + "completions/mean_terminated_length": 7149.51171875, + "completions/min_length": 1071.0, + "completions/min_terminated_length": 1071.0, + "entropy": 
0.9068904295563698, + "epoch": 0.53081876724931, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002491918858140707, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 508420417.0, + "reward": 0.3046875, + "reward_std": 0.22908622026443481, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999144077301025, + "sampling/importance_sampling_ratio/min": 0.0010015364969149232, + "sampling/sampling_logp_difference/max": 6.906219959259033, + "sampling/sampling_logp_difference/mean": 0.019857721403241158, + "step": 577 + }, + { + "clip_ratio/high_max": 2.723786337810452e-06, + "clip_ratio/high_mean": 6.80946584452613e-07, + "clip_ratio/low_mean": 4.729307283923845e-05, + "clip_ratio/low_min": 3.3817600524344016e-06, + "clip_ratio/region_mean": 4.7974018798413454e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16090.0, + "completions/mean_length": 7279.765625, + "completions/mean_terminated_length": 6909.67431640625, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "entropy": 0.7393763959407806, + "epoch": 0.531738730450782, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0038857783656567335, + "learning_rate": 1e-05, + "loss": 0.1167, + "num_tokens": 509367579.0, + "reward": 0.5703125, + "reward_std": 0.3782213628292084, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372959136963, + "sampling/importance_sampling_ratio/min": 8.482332486892119e-05, + "sampling/sampling_logp_difference/max": 9.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01783195324242115, + "step": 578 + }, + { + "clip_ratio/high_max": 2.4269288587674964e-05, + 
"clip_ratio/high_mean": 6.067322146918741e-06, + "clip_ratio/low_mean": 5.770765028501046e-05, + "clip_ratio/low_min": 6.032236342434771e-06, + "clip_ratio/region_mean": 6.377497174980817e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15946.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 5381.4375, + "completions/mean_terminated_length": 5381.4375, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.8337196409702301, + "epoch": 0.5326586936522539, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.004505726508796215, + "learning_rate": 1e-05, + "loss": 0.1534, + "num_tokens": 510076403.0, + "reward": 0.484375, + "reward_std": 0.3861297369003296, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999825358390808, + "sampling/importance_sampling_ratio/min": 0.0021874941885471344, + "sampling/sampling_logp_difference/max": 6.124998569488525, + "sampling/sampling_logp_difference/mean": 0.019285976886749268, + "step": 579 + }, + { + "clip_ratio/high_max": 1.83111833393923e-05, + "clip_ratio/high_mean": 4.577795834848075e-06, + "clip_ratio/low_mean": 4.1738339632502175e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.631613546735025e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15789.0, + "completions/mean_length": 8440.7109375, + "completions/mean_terminated_length": 8250.072265625, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "entropy": 0.8920768201351166, + "epoch": 0.5335786568537259, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0039497604593634605, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 511177974.0, + "reward": 0.1875, + "reward_std": 0.18990950286388397, + 
"rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910831451416, + "sampling/importance_sampling_ratio/min": 0.00021938055579084903, + "sampling/sampling_logp_difference/max": 8.424702644348145, + "sampling/sampling_logp_difference/mean": 0.020451124757528305, + "step": 580 + }, + { + "clip_ratio/high_max": 1.371111534353986e-05, + "clip_ratio/high_mean": 3.427778835884965e-06, + "clip_ratio/low_mean": 4.171912905803765e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.514690772339236e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16077.0, + "completions/mean_length": 6702.3828125, + "completions/mean_terminated_length": 6470.0244140625, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.8600481152534485, + "epoch": 0.5344986200551978, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024386425502598286, + "learning_rate": 1e-05, + "loss": 0.0866, + "num_tokens": 512054655.0, + "reward": 0.5703125, + "reward_std": 0.26645052433013916, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000202655792236, + "sampling/importance_sampling_ratio/min": 0.0015237311599776149, + "sampling/sampling_logp_difference/max": 6.486593246459961, + "sampling/sampling_logp_difference/mean": 0.018986206501722336, + "step": 581 + }, + { + "clip_ratio/high_max": 9.279537152906414e-06, + "clip_ratio/high_mean": 4.2680171645770315e-06, + "clip_ratio/low_mean": 2.6773893978315755e-05, + "clip_ratio/low_min": 4.736104074254399e-06, + "clip_ratio/region_mean": 3.1041911142892786e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 13410.0, + "completions/mean_length": 4845.953125, + "completions/mean_terminated_length": 4755.1025390625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.9067303538322449, + "epoch": 0.5354185832566697, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0072782449424266815, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 512696537.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999409317970276, + "sampling/importance_sampling_ratio/min": 0.017822081223130226, + "sampling/sampling_logp_difference/max": 4.027317047119141, + "sampling/sampling_logp_difference/mean": 0.01862735114991665, + "step": 582 + }, + { + "clip_ratio/high_max": 8.41807559481822e-06, + "clip_ratio/high_mean": 2.104518898704555e-06, + "clip_ratio/low_mean": 4.360654588708712e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5711064331044327e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16282.0, + "completions/mean_length": 6173.171875, + "completions/mean_terminated_length": 6011.095703125, + "completions/min_length": 756.0, + "completions/min_terminated_length": 756.0, + "entropy": 0.9604142308235168, + "epoch": 0.5363385464581417, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005057654343545437, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 513505135.0, + "reward": 0.4375, + "reward_std": 0.2767051160335541, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999635219573975, + "sampling/importance_sampling_ratio/min": 0.0002380619989708066, + 
"sampling/sampling_logp_difference/max": 8.342979431152344, + "sampling/sampling_logp_difference/mean": 0.020879898220300674, + "step": 583 + }, + { + "clip_ratio/high_max": 7.327939783863258e-06, + "clip_ratio/high_mean": 3.227510205761064e-06, + "clip_ratio/low_mean": 4.2579683963595016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.580719428304292e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15173.0, + "completions/mean_length": 5546.5234375, + "completions/mean_terminated_length": 5374.50048828125, + "completions/min_length": 1113.0, + "completions/min_terminated_length": 1113.0, + "entropy": 0.8015405982732773, + "epoch": 0.5372585096596136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0047672707587480545, + "learning_rate": 1e-05, + "loss": 0.0991, + "num_tokens": 514232058.0, + "reward": 0.4921875, + "reward_std": 0.27038949728012085, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 5.8323133998783305e-05, + "sampling/sampling_logp_difference/max": 9.74951171875, + "sampling/sampling_logp_difference/mean": 0.018185433000326157, + "step": 584 + }, + { + "clip_ratio/high_max": 1.3804907666781219e-05, + "clip_ratio/high_mean": 4.388961428958282e-06, + "clip_ratio/low_mean": 5.04182496570138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.480721097228525e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15778.0, + "completions/mean_length": 6637.359375, + "completions/mean_terminated_length": 6482.6513671875, + "completions/min_length": 1144.0, + "completions/min_terminated_length": 1144.0, + "entropy": 1.0173144191503525, + "epoch": 0.5381784728610856, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.005850035231560469, + "learning_rate": 1e-05, + "loss": 0.0453, + "num_tokens": 515103184.0, + "reward": 0.3046875, + "reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999963104724884, + "sampling/importance_sampling_ratio/min": 1.4479226706498594e-07, + "sampling/sampling_logp_difference/max": 15.747965812683105, + "sampling/sampling_logp_difference/mean": 0.020641878247261047, + "step": 585 + }, + { + "clip_ratio/high_max": 1.594428704265738e-05, + "clip_ratio/high_mean": 3.986071760664345e-06, + "clip_ratio/low_mean": 5.566071547491447e-05, + "clip_ratio/low_min": 8.978264304460026e-06, + "clip_ratio/region_mean": 5.964678746295249e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 6940.6171875, + "completions/mean_terminated_length": 6866.259765625, + "completions/min_length": 1273.0, + "completions/min_terminated_length": 1273.0, + "entropy": 0.8547529205679893, + "epoch": 0.5390984360625575, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037875184789299965, + "learning_rate": 1e-05, + "loss": 0.0831, + "num_tokens": 516009791.0, + "reward": 0.4765625, + "reward_std": 0.27222442626953125, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999997615814209, + "sampling/importance_sampling_ratio/min": 5.772008080384694e-06, + "sampling/sampling_logp_difference/max": 12.062490463256836, + "sampling/sampling_logp_difference/mean": 0.018527517095208168, + "step": 586 + }, + { + "clip_ratio/high_max": 6.924382887518732e-06, + "clip_ratio/high_mean": 1.731095721879683e-06, + 
"clip_ratio/low_mean": 3.340147941344185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5132575476382044e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15387.0, + "completions/mean_length": 6837.125, + "completions/mean_terminated_length": 6761.95263671875, + "completions/min_length": 1319.0, + "completions/min_terminated_length": 1319.0, + "entropy": 0.9027494043111801, + "epoch": 0.5400183992640294, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015506440540775657, + "learning_rate": 1e-05, + "loss": 0.0502, + "num_tokens": 516903335.0, + "reward": 0.296875, + "reward_std": 0.20593318343162537, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 4.2636147554730996e-05, + "sampling/sampling_logp_difference/max": 10.0628080368042, + "sampling/sampling_logp_difference/mean": 0.020130250602960587, + "step": 587 + }, + { + "clip_ratio/high_max": 1.2774215747413109e-05, + "clip_ratio/high_mean": 3.1935539368532773e-06, + "clip_ratio/low_mean": 3.885528553837503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.204883930469805e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7866.703125, + "completions/mean_terminated_length": 7222.5380859375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.8133657574653625, + "epoch": 0.5409383624655014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003520917845889926, + "learning_rate": 1e-05, + "loss": 0.1165, + "num_tokens": 517929081.0, + "reward": 0.4453125, + "reward_std": 0.3316730856895447, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 6.223546370165423e-05, + "sampling/sampling_logp_difference/max": 9.684585571289062, + "sampling/sampling_logp_difference/mean": 0.01890747994184494, + "step": 588 + }, + { + "clip_ratio/high_max": 6.942207619431429e-06, + "clip_ratio/high_mean": 1.7355519048578572e-06, + "clip_ratio/low_mean": 3.457626269209868e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.631181459695654e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 6701.296875, + "completions/mean_terminated_length": 6547.603515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9360691756010056, + "epoch": 0.5418583256669733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029796145390719175, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 518810247.0, + "reward": 0.3359375, + "reward_std": 0.2869499921798706, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 2.520391673144218e-10, + "sampling/sampling_logp_difference/max": 22.101436614990234, + "sampling/sampling_logp_difference/mean": 0.01977725327014923, + "step": 589 + }, + { + "clip_ratio/high_max": 3.7906356737948954e-06, + "clip_ratio/high_mean": 9.476589184487239e-07, + "clip_ratio/low_mean": 3.738725240509666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8334911323545384e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15971.0, + "completions/mean_length": 
7029.453125, + "completions/mean_terminated_length": 6804.9443359375, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.9168537557125092, + "epoch": 0.5427782888684453, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024249793495982885, + "learning_rate": 1e-05, + "loss": 0.0477, + "num_tokens": 519730577.0, + "reward": 0.390625, + "reward_std": 0.22803518176078796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 1.6278204384434503e-07, + "sampling/sampling_logp_difference/max": 15.630853652954102, + "sampling/sampling_logp_difference/mean": 0.01923082396388054, + "step": 590 + }, + { + "clip_ratio/high_max": 2.4759768621152034e-05, + "clip_ratio/high_mean": 6.1899421552880085e-06, + "clip_ratio/low_mean": 3.2254738812298456e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8444680967586464e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15600.0, + "completions/mean_length": 7255.453125, + "completions/mean_terminated_length": 6646.8837890625, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "entropy": 0.8241118341684341, + "epoch": 0.5436982520699172, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003160425927489996, + "learning_rate": 1e-05, + "loss": 0.0821, + "num_tokens": 520680707.0, + "reward": 0.3359375, + "reward_std": 0.2461756467819214, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000334978103638, + "sampling/importance_sampling_ratio/min": 0.0009408618789166212, + "sampling/sampling_logp_difference/max": 6.968714237213135, + 
"sampling/sampling_logp_difference/mean": 0.019255205988883972, + "step": 591 + }, + { + "clip_ratio/high_max": 7.459808557541692e-06, + "clip_ratio/high_mean": 1.864952139385423e-06, + "clip_ratio/low_mean": 3.9836502310208743e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.170145416537707e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 7819.96875, + "completions/mean_terminated_length": 7752.53564453125, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 1.1218742430210114, + "epoch": 0.5446182152713891, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00411194609478116, + "learning_rate": 1e-05, + "loss": 0.0267, + "num_tokens": 521703303.0, + "reward": 0.2265625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999041557312012, + "sampling/importance_sampling_ratio/min": 0.0003571478300727904, + "sampling/sampling_logp_difference/max": 7.937360763549805, + "sampling/sampling_logp_difference/mean": 0.022727783769369125, + "step": 592 + }, + { + "clip_ratio/high_max": 1.8858649582398357e-05, + "clip_ratio/high_mean": 4.714662395599589e-06, + "clip_ratio/low_mean": 3.738353416338214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2098196558981726e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16117.0, + "completions/mean_length": 6322.8671875, + "completions/mean_terminated_length": 6163.1669921875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 0.8323960080742836, + "epoch": 0.5455381784728611, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022753921803086996, + 
"learning_rate": 1e-05, + "loss": 0.0339, + "num_tokens": 522531422.0, + "reward": 0.4140625, + "reward_std": 0.20753081142902374, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998952150344849, + "sampling/importance_sampling_ratio/min": 5.422274170996388e-06, + "sampling/sampling_logp_difference/max": 12.124995231628418, + "sampling/sampling_logp_difference/mean": 0.01893780007958412, + "step": 593 + }, + { + "clip_ratio/high_max": 3.977598225901602e-06, + "clip_ratio/high_mean": 9.943995564754005e-07, + "clip_ratio/low_mean": 1.1187657776190463e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.2182057332665863e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16055.0, + "completions/mean_length": 7054.0625, + "completions/mean_terminated_length": 6905.96875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.866028867661953, + "epoch": 0.546458141674333, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004338000901043415, + "learning_rate": 1e-05, + "loss": -0.0134, + "num_tokens": 523453262.0, + "reward": 0.328125, + "reward_std": 0.13204573094844818, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998721480369568, + "sampling/importance_sampling_ratio/min": 7.97068714746274e-05, + "sampling/sampling_logp_difference/max": 9.437154769897461, + "sampling/sampling_logp_difference/mean": 0.01982954889535904, + "step": 594 + }, + { + "clip_ratio/high_max": 1.5038514220577781e-05, + "clip_ratio/high_mean": 3.7596285551444453e-06, + "clip_ratio/low_mean": 3.533169467573316e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.9091323742468376e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 7539.0703125, + "completions/mean_terminated_length": 7027.3798828125, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.8601142391562462, + "epoch": 0.547378104875805, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003401415189728141, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 524436831.0, + "reward": 0.4140625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999969482421875, + "sampling/importance_sampling_ratio/min": 2.0915547793265432e-05, + "sampling/sampling_logp_difference/max": 10.775017738342285, + "sampling/sampling_logp_difference/mean": 0.019884679466485977, + "step": 595 + }, + { + "clip_ratio/high_max": 2.9679867111553904e-05, + "clip_ratio/high_mean": 8.187421713046206e-06, + "clip_ratio/low_mean": 5.44505830930575e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.263800514716422e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16343.0, + "completions/mean_length": 7137.96875, + "completions/mean_terminated_length": 6762.11376953125, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.7909424379467964, + "epoch": 0.5482980680772769, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002879115054383874, + "learning_rate": 1e-05, + "loss": 0.0549, + "num_tokens": 525368091.0, + "reward": 0.546875, + "reward_std": 0.27062684297561646, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000025033950806, + "sampling/importance_sampling_ratio/min": 0.0004618439415935427, + "sampling/sampling_logp_difference/max": 7.680283546447754, + "sampling/sampling_logp_difference/mean": 0.01847894862294197, + "step": 596 + }, + { + "clip_ratio/high_max": 5.765416517533595e-06, + "clip_ratio/high_mean": 1.4413541293833987e-06, + "clip_ratio/low_mean": 3.1269102407804894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2710456423501455e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5486.3671875, + "completions/mean_terminated_length": 5224.82421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9588652476668358, + "epoch": 0.5492180312787488, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004545152187347412, + "learning_rate": 1e-05, + "loss": 0.0549, + "num_tokens": 526095378.0, + "reward": 0.359375, + "reward_std": 0.33508801460266113, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998891353607178, + "sampling/importance_sampling_ratio/min": 6.280510569922626e-05, + "sampling/sampling_logp_difference/max": 9.675474166870117, + "sampling/sampling_logp_difference/mean": 0.02017204463481903, + "step": 597 + }, + { + "clip_ratio/high_max": 1.519483475931338e-05, + "clip_ratio/high_mean": 4.732241109195456e-06, + "clip_ratio/low_mean": 4.477498589494644e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.950722734520241e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16169.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 6636.0078125, + "completions/mean_terminated_length": 6636.0078125, + "completions/min_length": 685.0, + 
"completions/min_terminated_length": 685.0, + "entropy": 0.9497648254036903, + "epoch": 0.5501379944802208, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004040954168885946, + "learning_rate": 1e-05, + "loss": 0.0477, + "num_tokens": 526969459.0, + "reward": 0.3515625, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 2.2340275407373156e-08, + "sampling/sampling_logp_difference/max": 17.61687469482422, + "sampling/sampling_logp_difference/mean": 0.02086419239640236, + "step": 598 + }, + { + "clip_ratio/high_max": 1.5785165032866644e-05, + "clip_ratio/high_mean": 3.946291258216661e-06, + "clip_ratio/low_mean": 4.7215530003086315e-05, + "clip_ratio/low_min": 5.274039267533226e-06, + "clip_ratio/region_mean": 5.116182205711084e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15820.0, + "completions/mean_length": 6462.953125, + "completions/mean_terminated_length": 6142.9189453125, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "entropy": 0.9401230812072754, + "epoch": 0.5510579576816927, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004678349941968918, + "learning_rate": 1e-05, + "loss": 0.1854, + "num_tokens": 527822197.0, + "reward": 0.5234375, + "reward_std": 0.3345640003681183, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997877478599548, + "sampling/importance_sampling_ratio/min": 2.8560234568431042e-05, + "sampling/sampling_logp_difference/max": 10.463495254516602, + "sampling/sampling_logp_difference/mean": 0.019832316786050797, + "step": 599 + }, + { + 
"clip_ratio/high_max": 4.1415414671064354e-06, + "clip_ratio/high_mean": 1.0353853667766089e-06, + "clip_ratio/low_mean": 4.795687004843785e-05, + "clip_ratio/low_min": 7.76807610236574e-06, + "clip_ratio/region_mean": 4.899225518784078e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15170.0, + "completions/mean_length": 7172.1015625, + "completions/mean_terminated_length": 6951.01611328125, + "completions/min_length": 1079.0, + "completions/min_terminated_length": 1079.0, + "entropy": 0.7962061613798141, + "epoch": 0.5519779208831647, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014094997895881534, + "learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 528759458.0, + "reward": 0.3515625, + "reward_std": 0.16834919154644012, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999281167984009, + "sampling/importance_sampling_ratio/min": 0.001331693259999156, + "sampling/sampling_logp_difference/max": 6.621304035186768, + "sampling/sampling_logp_difference/mean": 0.018519852310419083, + "step": 600 + }, + { + "clip_ratio/high_max": 7.3846517807396594e-06, + "clip_ratio/high_mean": 3.018199095095042e-06, + "clip_ratio/low_mean": 5.2064756346226204e-05, + "clip_ratio/low_min": 5.341652013157727e-06, + "clip_ratio/region_mean": 5.5082955441321246e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16195.0, + "completions/mean_length": 6612.6484375, + "completions/mean_terminated_length": 6378.13623046875, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.8218385726213455, + "epoch": 0.5528978840846366, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038943374529480934, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 
529626893.0, + "reward": 0.390625, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 0.0024450027849525213, + "sampling/sampling_logp_difference/max": 6.01370906829834, + "sampling/sampling_logp_difference/mean": 0.018441151827573776, + "step": 601 + }, + { + "clip_ratio/high_max": 8.209965471905889e-06, + "clip_ratio/high_mean": 2.0524913679764722e-06, + "clip_ratio/low_mean": 4.8717710285473004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.077020244925734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15898.0, + "completions/mean_length": 6574.9140625, + "completions/mean_terminated_length": 6419.21484375, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9268836230039597, + "epoch": 0.5538178472861086, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027088895440101624, + "learning_rate": 1e-05, + "loss": 0.0577, + "num_tokens": 530486578.0, + "reward": 0.4453125, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000026822090149, + "sampling/importance_sampling_ratio/min": 1.1735714906535577e-05, + "sampling/sampling_logp_difference/max": 11.352873802185059, + "sampling/sampling_logp_difference/mean": 0.020115964114665985, + "step": 602 + }, + { + "clip_ratio/high_max": 5.24967435922008e-06, + "clip_ratio/high_mean": 1.31241858980502e-06, + "clip_ratio/low_mean": 1.3909025255998131e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5221443845803151e-05, + "completions/clipped_ratio": 0.0078125, 
+ "completions/max_length": 16384.0, + "completions/max_terminated_length": 14361.0, + "completions/mean_length": 6209.1953125, + "completions/mean_terminated_length": 6129.07861328125, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9574517607688904, + "epoch": 0.5547378104875805, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.002628365531563759, + "learning_rate": 1e-05, + "loss": 0.0461, + "num_tokens": 531303083.0, + "reward": 0.3671875, + "reward_std": 0.13098490238189697, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998608827590942, + "sampling/importance_sampling_ratio/min": 2.862734254449606e-05, + "sampling/sampling_logp_difference/max": 10.461148262023926, + "sampling/sampling_logp_difference/mean": 0.019658785313367844, + "step": 603 + }, + { + "clip_ratio/high_max": 1.9014597455679905e-05, + "clip_ratio/high_mean": 4.753649363919976e-06, + "clip_ratio/low_mean": 4.9158792762682424e-05, + "clip_ratio/low_min": 4.514427928370424e-06, + "clip_ratio/region_mean": 5.39124412171077e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13873.0, + "completions/mean_length": 7079.1875, + "completions/mean_terminated_length": 6855.87255859375, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 0.853938102722168, + "epoch": 0.5556577736890524, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004664157051593065, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 532228227.0, + "reward": 0.2734375, + "reward_std": 0.30327796936035156, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999879598617554, + 
"sampling/importance_sampling_ratio/min": 5.377535785555665e-07, + "sampling/sampling_logp_difference/max": 14.43586540222168, + "sampling/sampling_logp_difference/mean": 0.018260695040225983, + "step": 604 + }, + { + "clip_ratio/high_max": 3.025483556484687e-05, + "clip_ratio/high_mean": 7.563708891211718e-06, + "clip_ratio/low_mean": 2.1738228269896354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9301936820047558e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15094.0, + "completions/max_terminated_length": 15094.0, + "completions/mean_length": 6071.5390625, + "completions/mean_terminated_length": 6071.5390625, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "entropy": 0.980722151696682, + "epoch": 0.5565777368905244, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004579839296638966, + "learning_rate": 1e-05, + "loss": 0.0168, + "num_tokens": 533024264.0, + "reward": 0.4765625, + "reward_std": 0.30327799916267395, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999982476234436, + "sampling/importance_sampling_ratio/min": 0.0003390153287909925, + "sampling/sampling_logp_difference/max": 7.989465236663818, + "sampling/sampling_logp_difference/mean": 0.01974770799279213, + "step": 605 + }, + { + "clip_ratio/high_max": 1.3344870239961892e-05, + "clip_ratio/high_mean": 4.773990667672479e-06, + "clip_ratio/low_mean": 5.142044130934664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6194432318079635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7352.484375, + "completions/mean_terminated_length": 7209.12744140625, + "completions/min_length": 1310.0, + "completions/min_terminated_length": 1310.0, + "entropy": 
0.7858814746141434, + "epoch": 0.5574977000919963, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002537919208407402, + "learning_rate": 1e-05, + "loss": 0.0576, + "num_tokens": 533985318.0, + "reward": 0.3125, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037981033325, + "sampling/importance_sampling_ratio/min": 0.0017827138071879745, + "sampling/sampling_logp_difference/max": 6.329618453979492, + "sampling/sampling_logp_difference/mean": 0.018647275865077972, + "step": 606 + }, + { + "clip_ratio/high_max": 2.345925531699322e-05, + "clip_ratio/high_mean": 7.0977013137962786e-06, + "clip_ratio/low_mean": 4.466222731025482e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.175992941985896e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16082.0, + "completions/mean_length": 7095.1875, + "completions/mean_terminated_length": 6947.74658203125, + "completions/min_length": 1073.0, + "completions/min_terminated_length": 1073.0, + "entropy": 0.6846291124820709, + "epoch": 0.5584176632934683, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037982286885380745, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 534912558.0, + "reward": 0.53125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147057533264, + "sampling/importance_sampling_ratio/min": 8.089523180387914e-05, + "sampling/sampling_logp_difference/max": 9.422355651855469, + "sampling/sampling_logp_difference/mean": 0.01693977229297161, + "step": 607 + }, + { + "clip_ratio/high_max": 5.167851668375079e-06, + "clip_ratio/high_mean": 
1.2919629170937696e-06, + "clip_ratio/low_mean": 6.557838094067847e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.687034363039857e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6038.1953125, + "completions/mean_terminated_length": 5873.9765625, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "entropy": 0.8637901693582535, + "epoch": 0.5593376264949402, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030545955523848534, + "learning_rate": 1e-05, + "loss": 0.0716, + "num_tokens": 535707127.0, + "reward": 0.5078125, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999387264251709, + "sampling/importance_sampling_ratio/min": 0.00017956242663785815, + "sampling/sampling_logp_difference/max": 8.624987602233887, + "sampling/sampling_logp_difference/mean": 0.018705151975154877, + "step": 608 + }, + { + "clip_ratio/high_max": 1.7691760149318725e-05, + "clip_ratio/high_mean": 5.544901910070621e-06, + "clip_ratio/low_mean": 5.012885230826214e-05, + "clip_ratio/low_min": 3.5653165468829684e-06, + "clip_ratio/region_mean": 5.5673754559393274e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14906.0, + "completions/mean_length": 6978.0078125, + "completions/mean_terminated_length": 6828.70654296875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.7931060045957565, + "epoch": 0.5602575896964122, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002951717935502529, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 536618376.0, + "reward": 0.46875, + "reward_std": 0.3527044355869293, + 
"rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 3.865327380481176e-05, + "sampling/sampling_logp_difference/max": 10.160879135131836, + "sampling/sampling_logp_difference/mean": 0.018486514687538147, + "step": 609 + }, + { + "clip_ratio/high_max": 2.1591150925814873e-05, + "clip_ratio/high_mean": 5.397787731453718e-06, + "clip_ratio/low_mean": 6.101864732954709e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.6416435629435e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15329.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6810.15625, + "completions/mean_terminated_length": 6810.15625, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.8957240954041481, + "epoch": 0.5611775528978841, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019385438645258546, + "learning_rate": 1e-05, + "loss": 0.0973, + "num_tokens": 537513876.0, + "reward": 0.328125, + "reward_std": 0.28011518716812134, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000025749206543, + "sampling/importance_sampling_ratio/min": 4.845474904868752e-05, + "sampling/sampling_logp_difference/max": 9.934880256652832, + "sampling/sampling_logp_difference/mean": 0.02021351456642151, + "step": 610 + }, + { + "clip_ratio/high_max": 1.4817902865615906e-05, + "clip_ratio/high_mean": 5.914362077419355e-06, + "clip_ratio/low_mean": 1.2616926369446446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8531288333178964e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16065.0, + 
"completions/mean_length": 6940.4140625, + "completions/mean_terminated_length": 6713.7685546875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.8646975234150887, + "epoch": 0.562097516099356, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001886329147964716, + "learning_rate": 1e-05, + "loss": 0.0319, + "num_tokens": 538419265.0, + "reward": 0.375, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000052452087402, + "sampling/importance_sampling_ratio/min": 6.893687327647058e-07, + "sampling/sampling_logp_difference/max": 14.18748950958252, + "sampling/sampling_logp_difference/mean": 0.019072774797677994, + "step": 611 + }, + { + "clip_ratio/high_max": 6.3681300161988474e-06, + "clip_ratio/high_mean": 1.5920325040497119e-06, + "clip_ratio/low_mean": 3.254086982451554e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4132902555938927e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15960.0, + "completions/mean_length": 7508.796875, + "completions/mean_terminated_length": 6995.35498046875, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.7723299860954285, + "epoch": 0.563017479300828, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002031022449955344, + "learning_rate": 1e-05, + "loss": 0.0335, + "num_tokens": 539399127.0, + "reward": 0.4296875, + "reward_std": 0.2301519513130188, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0056421491317451, + "sampling/sampling_logp_difference/max": 
5.177490234375, + "sampling/sampling_logp_difference/mean": 0.01832709088921547, + "step": 612 + }, + { + "clip_ratio/high_max": 1.5848977909627138e-05, + "clip_ratio/high_mean": 3.9622444774067844e-06, + "clip_ratio/low_mean": 2.6742804038804024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.070504851621081e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15816.0, + "completions/mean_length": 6019.6484375, + "completions/mean_terminated_length": 5938.03955078125, + "completions/min_length": 1020.0, + "completions/min_terminated_length": 1020.0, + "entropy": 0.7425512671470642, + "epoch": 0.5639374425022999, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003653773572295904, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 540189602.0, + "reward": 0.53125, + "reward_std": 0.26143303513526917, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999122619628906, + "sampling/importance_sampling_ratio/min": 0.005288486368954182, + "sampling/sampling_logp_difference/max": 5.242223262786865, + "sampling/sampling_logp_difference/mean": 0.017161473631858826, + "step": 613 + }, + { + "clip_ratio/high_max": 1.1017190900020069e-05, + "clip_ratio/high_mean": 2.754297725005017e-06, + "clip_ratio/low_mean": 3.428678644468164e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7041084169686656e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15861.0, + "completions/mean_length": 7155.6953125, + "completions/mean_terminated_length": 6621.826171875, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "entropy": 0.9789249897003174, + "epoch": 0.5648574057037719, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.003739065257832408, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 541125587.0, + "reward": 0.265625, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271631240845, + "sampling/importance_sampling_ratio/min": 9.236609002982732e-06, + "sampling/sampling_logp_difference/max": 11.59233570098877, + "sampling/sampling_logp_difference/mean": 0.02008877694606781, + "step": 614 + }, + { + "clip_ratio/high_max": 5.6091539590852335e-06, + "clip_ratio/high_mean": 2.4549021873099264e-06, + "clip_ratio/low_mean": 4.249646542575647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4951367613066395e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13553.0, + "completions/mean_length": 8027.359375, + "completions/mean_terminated_length": 7470.25048828125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.9153474718332291, + "epoch": 0.5657773689052438, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0020656392443925142, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 542173801.0, + "reward": 0.2578125, + "reward_std": 0.22225633263587952, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999947190284729, + "sampling/importance_sampling_ratio/min": 0.00029620854184031487, + "sampling/sampling_logp_difference/max": 8.124446868896484, + "sampling/sampling_logp_difference/mean": 0.021495234221220016, + "step": 615 + }, + { + "clip_ratio/high_max": 1.7302586002188036e-05, + "clip_ratio/high_mean": 4.325646500547009e-06, + "clip_ratio/low_mean": 5.2193488272678223e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 5.6519134659538395e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6115.3828125, + "completions/mean_terminated_length": 5952.38916015625, + "completions/min_length": 1158.0, + "completions/min_terminated_length": 1158.0, + "entropy": 0.751783661544323, + "epoch": 0.5666973321067157, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00824788399040699, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 542977266.0, + "reward": 0.4609375, + "reward_std": 0.30616888403892517, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999478459358215, + "sampling/importance_sampling_ratio/min": 0.0013296925462782383, + "sampling/sampling_logp_difference/max": 6.622807502746582, + "sampling/sampling_logp_difference/mean": 0.017732972279191017, + "step": 616 + }, + { + "clip_ratio/high_max": 2.872588265745435e-05, + "clip_ratio/high_mean": 8.185486876755022e-06, + "clip_ratio/low_mean": 5.301810256241879e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.120358921180014e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15688.0, + "completions/mean_length": 7431.3203125, + "completions/mean_terminated_length": 7142.52392578125, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "entropy": 0.9122852608561516, + "epoch": 0.5676172953081877, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005189655348658562, + "learning_rate": 1e-05, + "loss": 0.0613, + "num_tokens": 543947515.0, + "reward": 0.484375, + "reward_std": 0.21595832705497742, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, 
+ "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.00017607140762265772, + "sampling/sampling_logp_difference/max": 8.644620895385742, + "sampling/sampling_logp_difference/mean": 0.02111673541367054, + "step": 617 + }, + { + "clip_ratio/high_max": 3.984698651038343e-06, + "clip_ratio/high_mean": 9.961746627595858e-07, + "clip_ratio/low_mean": 3.414959587644262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.514577088026272e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16378.0, + "completions/mean_length": 5700.5546875, + "completions/mean_terminated_length": 5530.9765625, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8961661159992218, + "epoch": 0.5685372585096596, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004707770887762308, + "learning_rate": 1e-05, + "loss": 0.0773, + "num_tokens": 544694826.0, + "reward": 0.4921875, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998490214347839, + "sampling/importance_sampling_ratio/min": 5.211461817644647e-10, + "sampling/sampling_logp_difference/max": 21.374990463256836, + "sampling/sampling_logp_difference/mean": 0.018697837367653847, + "step": 618 + }, + { + "clip_ratio/high_max": 1.1809721399913542e-05, + "clip_ratio/high_mean": 2.9524303499783855e-06, + "clip_ratio/low_mean": 5.229935004535946e-05, + "clip_ratio/low_min": 4.098226327187149e-06, + "clip_ratio/region_mean": 5.525178062271152e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12422.0, + "completions/max_terminated_length": 12422.0, + "completions/mean_length": 4201.6796875, + "completions/mean_terminated_length": 4201.6796875, + "completions/min_length": 
436.0, + "completions/min_terminated_length": 436.0, + "entropy": 0.7066933363676071, + "epoch": 0.5694572217111316, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.00980924628674984, + "learning_rate": 1e-05, + "loss": 0.0492, + "num_tokens": 545255377.0, + "reward": 0.5625, + "reward_std": 0.38664889335632324, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000074028968811, + "sampling/importance_sampling_ratio/min": 7.827866647858173e-05, + "sampling/sampling_logp_difference/max": 9.455235481262207, + "sampling/sampling_logp_difference/mean": 0.016301468014717102, + "step": 619 + }, + { + "clip_ratio/high_max": 6.093102456361521e-06, + "clip_ratio/high_mean": 1.5232756140903803e-06, + "clip_ratio/low_mean": 1.853809601470857e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0061371856172627e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13234.0, + "completions/mean_length": 5782.2578125, + "completions/mean_terminated_length": 5613.9765625, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.846621498465538, + "epoch": 0.5703771849126035, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005619424395263195, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 546013882.0, + "reward": 0.46875, + "reward_std": 0.2472364753484726, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000319480895996, + "sampling/importance_sampling_ratio/min": 9.447568299947307e-05, + "sampling/sampling_logp_difference/max": 9.267168045043945, + "sampling/sampling_logp_difference/mean": 0.018704919144511223, + "step": 620 + }, + { + "clip_ratio/high_max": 
1.6747734207456233e-05, + "clip_ratio/high_mean": 4.186933551864058e-06, + "clip_ratio/low_mean": 4.008232758678787e-05, + "clip_ratio/low_min": 3.511630438879365e-06, + "clip_ratio/region_mean": 4.426926193445979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 7191.4921875, + "completions/mean_terminated_length": 7045.57958984375, + "completions/min_length": 1379.0, + "completions/min_terminated_length": 1379.0, + "entropy": 0.7846563309431076, + "epoch": 0.5712971481140754, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0063271005637943745, + "learning_rate": 1e-05, + "loss": 0.0964, + "num_tokens": 546954857.0, + "reward": 0.4296875, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999164342880249, + "sampling/importance_sampling_ratio/min": 0.006330032367259264, + "sampling/sampling_logp_difference/max": 5.062449932098389, + "sampling/sampling_logp_difference/mean": 0.01846012845635414, + "step": 621 + }, + { + "clip_ratio/high_max": 3.451678094279487e-05, + "clip_ratio/high_mean": 1.2486661603361426e-05, + "clip_ratio/low_mean": 5.253966105556174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.502632390947838e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15529.0, + "completions/max_terminated_length": 15529.0, + "completions/mean_length": 5491.7421875, + "completions/mean_terminated_length": 5491.7421875, + "completions/min_length": 1644.0, + "completions/min_terminated_length": 1644.0, + "entropy": 0.6960643380880356, + "epoch": 0.5722171113155474, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005836677737534046, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 547676024.0, + "reward": 0.5625, + "reward_std": 
0.43213340640068054, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999930739402771, + "sampling/importance_sampling_ratio/min": 0.00043176248436793685, + "sampling/sampling_logp_difference/max": 7.7476348876953125, + "sampling/sampling_logp_difference/mean": 0.016565188765525818, + "step": 622 + }, + { + "clip_ratio/high_max": 4.318982973927632e-06, + "clip_ratio/high_mean": 1.079745743481908e-06, + "clip_ratio/low_mean": 3.0399249226320535e-05, + "clip_ratio/low_min": 5.838393462909153e-06, + "clip_ratio/region_mean": 3.147899496980244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16179.0, + "completions/mean_length": 6993.125, + "completions/mean_terminated_length": 6844.06396484375, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "entropy": 0.8031502217054367, + "epoch": 0.5731370745170193, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00226933928206563, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 548590080.0, + "reward": 0.3984375, + "reward_std": 0.19332444667816162, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 1.1417677114877733e-06, + "sampling/sampling_logp_difference/max": 13.68293285369873, + "sampling/sampling_logp_difference/mean": 0.01880657486617565, + "step": 623 + }, + { + "clip_ratio/high_max": 8.404208529100288e-06, + "clip_ratio/high_mean": 2.101052132275072e-06, + "clip_ratio/low_mean": 4.231840989632474e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.441946202859981e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15278.0, + 
"completions/max_terminated_length": 15278.0, + "completions/mean_length": 5602.8359375, + "completions/mean_terminated_length": 5602.8359375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.8287182524800301, + "epoch": 0.5740570377184913, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005067484453320503, + "learning_rate": 1e-05, + "loss": 0.0394, + "num_tokens": 549327251.0, + "reward": 0.5, + "reward_std": 0.35218530893325806, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701380729675, + "sampling/importance_sampling_ratio/min": 0.0036069792695343494, + "sampling/sampling_logp_difference/max": 5.624884605407715, + "sampling/sampling_logp_difference/mean": 0.018545404076576233, + "step": 624 + }, + { + "clip_ratio/high_max": 7.49742275729659e-06, + "clip_ratio/high_mean": 1.8743556893241475e-06, + "clip_ratio/low_mean": 4.6288066641864134e-05, + "clip_ratio/low_min": 5.32640206074575e-06, + "clip_ratio/region_mean": 4.816242244487512e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15901.0, + "completions/mean_length": 6747.0234375, + "completions/mean_terminated_length": 6671.1416015625, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "entropy": 0.8722762316465378, + "epoch": 0.5749770009199632, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023132911883294582, + "learning_rate": 1e-05, + "loss": 0.0064, + "num_tokens": 550208750.0, + "reward": 0.390625, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999475479125977, + "sampling/importance_sampling_ratio/min": 
0.003727440955117345, + "sampling/sampling_logp_difference/max": 5.592033386230469, + "sampling/sampling_logp_difference/mean": 0.019216621294617653, + "step": 625 + }, + { + "clip_ratio/high_max": 7.693567567912396e-06, + "clip_ratio/high_mean": 1.923391891978099e-06, + "clip_ratio/low_mean": 6.517495285152108e-05, + "clip_ratio/low_min": 1.1217302017030306e-05, + "clip_ratio/region_mean": 6.709834497087286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16027.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 6983.40625, + "completions/mean_terminated_length": 6983.40625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.8781512826681137, + "epoch": 0.5758969641214351, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036700034979730844, + "learning_rate": 1e-05, + "loss": 0.0905, + "num_tokens": 551123002.0, + "reward": 0.328125, + "reward_std": 0.2419992983341217, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999868273735046, + "sampling/importance_sampling_ratio/min": 5.0360464229015633e-05, + "sampling/sampling_logp_difference/max": 9.8963041305542, + "sampling/sampling_logp_difference/mean": 0.019318291917443275, + "step": 626 + }, + { + "clip_ratio/high_max": 5.098295332572889e-06, + "clip_ratio/high_mean": 1.2745738331432221e-06, + "clip_ratio/low_mean": 5.9073974398415885e-05, + "clip_ratio/low_min": 6.781316187698394e-06, + "clip_ratio/region_mean": 6.034854845893278e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16201.0, + "completions/mean_length": 7143.671875, + "completions/mean_terminated_length": 6689.22900390625, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.7715872526168823, + 
"epoch": 0.5768169273229071, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036717690527439117, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 552055472.0, + "reward": 0.3671875, + "reward_std": 0.2212003767490387, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998798966407776, + "sampling/importance_sampling_ratio/min": 0.00012340980174485594, + "sampling/sampling_logp_difference/max": 9.0, + "sampling/sampling_logp_difference/mean": 0.018518533557653427, + "step": 627 + }, + { + "clip_ratio/high_max": 1.778747127900715e-05, + "clip_ratio/high_mean": 4.4468678197517875e-06, + "clip_ratio/low_mean": 2.460010267668622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9046970439594588e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15729.0, + "completions/mean_length": 6558.5859375, + "completions/mean_terminated_length": 6075.36865234375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 1061.0, + "entropy": 0.9016438648104668, + "epoch": 0.577736890524379, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019187588477507234, + "learning_rate": 1e-05, + "loss": 0.0494, + "num_tokens": 552914275.0, + "reward": 0.484375, + "reward_std": 0.2041158676147461, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999418258666992, + "sampling/importance_sampling_ratio/min": 0.00011496193474158645, + "sampling/sampling_logp_difference/max": 9.07090950012207, + "sampling/sampling_logp_difference/mean": 0.01948089525103569, + "step": 628 + }, + { + "clip_ratio/high_max": 1.383282506139949e-05, + "clip_ratio/high_mean": 3.4582062653498724e-06, + "clip_ratio/low_mean": 
4.3287541757308645e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.674574802265852e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15812.0, + "completions/max_terminated_length": 15812.0, + "completions/mean_length": 6150.2734375, + "completions/mean_terminated_length": 6150.2734375, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.8385711833834648, + "epoch": 0.578656853725851, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003598993644118309, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 553719958.0, + "reward": 0.5078125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999948740005493, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.019557828083634377, + "step": 629 + }, + { + "clip_ratio/high_max": 2.668830120455823e-06, + "clip_ratio/high_mean": 6.672075301139557e-07, + "clip_ratio/low_mean": 1.7461135655594262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8128343185708218e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 8142.46875, + "completions/mean_terminated_length": 7519.16015625, + "completions/min_length": 1828.0, + "completions/min_terminated_length": 1828.0, + "entropy": 0.8508284538984299, + "epoch": 0.5795768169273229, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.002453390508890152, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 554784458.0, + "reward": 0.390625, + "reward_std": 0.1422954648733139, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999715089797974, + "sampling/importance_sampling_ratio/min": 0.0002036939695244655, + "sampling/sampling_logp_difference/max": 8.498891830444336, + "sampling/sampling_logp_difference/mean": 0.019445519894361496, + "step": 630 + }, + { + "clip_ratio/high_max": 1.9002460248884745e-05, + "clip_ratio/high_mean": 4.750615062221186e-06, + "clip_ratio/low_mean": 3.1556500402984966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630711614732718e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7665.921875, + "completions/mean_terminated_length": 7384.693359375, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.7667205557227135, + "epoch": 0.5804967801287948, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027936683036386967, + "learning_rate": 1e-05, + "loss": 0.0245, + "num_tokens": 555783296.0, + "reward": 0.4296875, + "reward_std": 0.24435830116271973, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998488426208496, + "sampling/importance_sampling_ratio/min": 0.0002781523216981441, + "sampling/sampling_logp_difference/max": 8.187341690063477, + "sampling/sampling_logp_difference/mean": 0.01912892609834671, + "step": 631 + }, + { + "clip_ratio/high_max": 1.5569996094200178e-05, + "clip_ratio/high_mean": 3.8924990235500445e-06, + "clip_ratio/low_mean": 3.8605214058407e-05, + "clip_ratio/low_min": 6.2870940382708795e-06, + "clip_ratio/region_mean": 4.249771222930576e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 7266.171875, + 
"completions/mean_terminated_length": 6972.04833984375, + "completions/min_length": 1117.0, + "completions/min_terminated_length": 1117.0, + "entropy": 0.7114122956991196, + "epoch": 0.5814167433302668, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004213637672364712, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 556732942.0, + "reward": 0.5390625, + "reward_std": 0.3135277032852173, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999159574508667, + "sampling/importance_sampling_ratio/min": 1.760348027346481e-06, + "sampling/sampling_logp_difference/max": 13.249999046325684, + "sampling/sampling_logp_difference/mean": 0.01689826510846615, + "step": 632 + }, + { + "clip_ratio/high_max": 2.1737864472015644e-05, + "clip_ratio/high_mean": 5.434466118003911e-06, + "clip_ratio/low_mean": 3.640393322257296e-05, + "clip_ratio/low_min": 3.0146634344419e-06, + "clip_ratio/region_mean": 4.183839985216764e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6532.9921875, + "completions/mean_terminated_length": 6296.568359375, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.7711968123912811, + "epoch": 0.5823367065317387, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004169877618551254, + "learning_rate": 1e-05, + "loss": 0.0406, + "num_tokens": 557589141.0, + "reward": 0.546875, + "reward_std": 0.2675113081932068, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999022483825684, + "sampling/importance_sampling_ratio/min": 4.499705482885474e-06, + "sampling/sampling_logp_difference/max": 12.311498641967773, + 
"sampling/sampling_logp_difference/mean": 0.018738210201263428, + "step": 633 + }, + { + "clip_ratio/high_max": 6.099523716329713e-06, + "clip_ratio/high_mean": 1.5248809290824283e-06, + "clip_ratio/low_mean": 6.070675681257853e-05, + "clip_ratio/low_min": 5.175126261747209e-06, + "clip_ratio/region_mean": 6.223163745744387e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16337.0, + "completions/mean_length": 7384.3203125, + "completions/mean_terminated_length": 7168.328125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.8054972141981125, + "epoch": 0.5832566697332107, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032470994628965855, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 558557286.0, + "reward": 0.4140625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999680519104004, + "sampling/importance_sampling_ratio/min": 0.00019634375348687172, + "sampling/sampling_logp_difference/max": 8.535643577575684, + "sampling/sampling_logp_difference/mean": 0.019018521532416344, + "step": 634 + }, + { + "clip_ratio/high_max": 4.436853964762122e-05, + "clip_ratio/high_mean": 1.1092134911905305e-05, + "clip_ratio/low_mean": 3.798940008437057e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.908153437099827e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15918.0, + "completions/mean_length": 6131.9453125, + "completions/mean_terminated_length": 6051.22021484375, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "entropy": 0.8365718051791191, + "epoch": 0.5841766329346826, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.004848263692110777, + "learning_rate": 1e-05, + "loss": 0.1247, + "num_tokens": 559364639.0, + "reward": 0.5625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000056266784668, + "sampling/importance_sampling_ratio/min": 5.424115443020128e-06, + "sampling/sampling_logp_difference/max": 12.124655723571777, + "sampling/sampling_logp_difference/mean": 0.018360167741775513, + "step": 635 + }, + { + "clip_ratio/high_max": 1.9398633412492927e-05, + "clip_ratio/high_mean": 4.849658353123232e-06, + "clip_ratio/low_mean": 2.7543567512111622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239322609260853e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15724.0, + "completions/max_terminated_length": 15724.0, + "completions/mean_length": 5746.8828125, + "completions/mean_terminated_length": 5746.8828125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.6247628927230835, + "epoch": 0.5850965961361545, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003403177484869957, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 560119248.0, + "reward": 0.5390625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999486207962036, + "sampling/importance_sampling_ratio/min": 6.475952432083432e-07, + "sampling/sampling_logp_difference/max": 14.25, + "sampling/sampling_logp_difference/mean": 0.015006184577941895, + "step": 636 + }, + { + "clip_ratio/high_max": 2.857848289750109e-05, + "clip_ratio/high_mean": 8.111364707019675e-06, + "clip_ratio/low_mean": 4.927243321617425e-05, + "clip_ratio/low_min": 5.929088274569949e-06, + 
"clip_ratio/region_mean": 5.738379809372418e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7313.7890625, + "completions/mean_terminated_length": 7096.1044921875, + "completions/min_length": 1068.0, + "completions/min_terminated_length": 1068.0, + "entropy": 0.8606570512056351, + "epoch": 0.5860165593376265, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004058506805449724, + "learning_rate": 1e-05, + "loss": 0.093, + "num_tokens": 561072493.0, + "reward": 0.375, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0006621598731726408, + "sampling/sampling_logp_difference/max": 7.320003509521484, + "sampling/sampling_logp_difference/mean": 0.01940958946943283, + "step": 637 + }, + { + "clip_ratio/high_max": 2.7213282010052353e-05, + "clip_ratio/high_mean": 7.758043807370996e-06, + "clip_ratio/low_mean": 4.890350828645751e-05, + "clip_ratio/low_min": 3.968002147303196e-06, + "clip_ratio/region_mean": 5.666155129802064e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16093.0, + "completions/mean_length": 7495.5078125, + "completions/mean_terminated_length": 7425.51953125, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8225502669811249, + "epoch": 0.5869365225390984, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002768489997833967, + "learning_rate": 1e-05, + "loss": 0.098, + "num_tokens": 562048734.0, + "reward": 0.3671875, + "reward_std": 0.344813734292984, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 1.4612716768169776e-05, + "sampling/sampling_logp_difference/max": 11.133618354797363, + "sampling/sampling_logp_difference/mean": 0.0189508069306612, + "step": 638 + }, + { + "clip_ratio/high_max": 2.5246594077543705e-05, + "clip_ratio/high_mean": 6.311648519385926e-06, + "clip_ratio/low_mean": 4.9131452101391915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.544310107552519e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 6856.5703125, + "completions/mean_terminated_length": 6627.912109375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.8542520478367805, + "epoch": 0.5878564857405704, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002966079628095031, + "learning_rate": 1e-05, + "loss": 0.0507, + "num_tokens": 562945623.0, + "reward": 0.40625, + "reward_std": 0.3016803562641144, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998261332511902, + "sampling/importance_sampling_ratio/min": 0.0001795661955839023, + "sampling/sampling_logp_difference/max": 8.624966621398926, + "sampling/sampling_logp_difference/mean": 0.019664689898490906, + "step": 639 + }, + { + "clip_ratio/high_max": 1.2127683930884814e-05, + "clip_ratio/high_mean": 5.316983106240514e-06, + "clip_ratio/low_mean": 4.154238490627904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.685936778514588e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15231.0, + "completions/mean_length": 6463.2421875, + "completions/mean_terminated_length": 
6305.77001953125, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.8427078947424889, + "epoch": 0.5887764489420423, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021058651618659496, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 563789214.0, + "reward": 0.3046875, + "reward_std": 0.24541424214839935, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998518824577332, + "sampling/importance_sampling_ratio/min": 0.00043074542190879583, + "sampling/sampling_logp_difference/max": 7.749993324279785, + "sampling/sampling_logp_difference/mean": 0.01898353546857834, + "step": 640 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 563789214, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-640/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-640/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-640/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def 
get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + 
frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint.
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/README.md b/dapo_milora_plus_20251201_131939/checkpoint-704/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + 
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/adapter_config.json b/dapo_milora_plus_20251201_131939/checkpoint-704/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6ce1ba3b0158d1d30476aa899f5fc31c4b27d76 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + 
"k_proj", + "up_proj", + "v_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/chat_template.jinja b/dapo_milora_plus_20251201_131939/checkpoint-704/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else 
%}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/latest b/dapo_milora_plus_20251201_131939/checkpoint-704/latest new file mode 100644 index 0000000000000000000000000000000000000000..a467c93394af75577cc1648673b23e2ec8a3f7c8 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/latest @@ -0,0 +1 @@ +global_step704 \ No newline at end of file diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/special_tokens_map.json b/dapo_milora_plus_20251201_131939/checkpoint-704/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/tokenizer_config.json 
b/dapo_milora_plus_20251201_131939/checkpoint-704/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + 
"tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/trainer_state.json b/dapo_milora_plus_20251201_131939/checkpoint-704/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..83274adc85c541af911ecf7654656966957f9fcd --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/trainer_state.json @@ -0,0 +1,21858 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6476540938362465, + "eval_steps": 500, + "global_step": 704, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004114801995456219, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 4.125957275391556e-06, + "clip_ratio/high_mean": 1.031489318847889e-06, + 
"clip_ratio/low_mean": 5.146006606082665e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.249155537967454e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15112.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 4978.265625, + "completions/mean_terminated_length": 4978.265625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.9862165078520775, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004017667844891548, + "learning_rate": 1e-05, + "loss": 0.0407, + "num_tokens": 1452816.0, + "reward": 0.3203125, + "reward_std": 0.30798622965812683, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.00840891432017088, + "sampling/sampling_logp_difference/max": 4.778462886810303, + "sampling/sampling_logp_difference/mean": 0.01879144087433815, + "step": 2 + }, + { + "clip_ratio/high_max": 5.936832167208195e-06, + "clip_ratio/high_mean": 1.4842080418020487e-06, + "clip_ratio/low_mean": 5.7621912446848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.910612048865005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 6664.3046875, + "completions/mean_terminated_length": 6587.771484375, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.9934953600168228, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002359058242291212, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 2324415.0, + "reward": 0.2890625, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000684261322021, + "sampling/importance_sampling_ratio/min": 0.0018158734310418367, + "sampling/sampling_logp_difference/max": 6.311188697814941, + "sampling/sampling_logp_difference/mean": 0.02111843228340149, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0255745564791141e-05, + "clip_ratio/high_mean": 2.5639363911977853e-06, + "clip_ratio/low_mean": 2.648322629283939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9047162797724013e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 5801.203125, + "completions/mean_terminated_length": 5717.8740234375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0870511680841446, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002563449554145336, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 3091369.0, + "reward": 0.2734375, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000073790550232, + "sampling/importance_sampling_ratio/min": 0.018811559304594994, + "sampling/sampling_logp_difference/max": 3.9732837677001953, + "sampling/sampling_logp_difference/mean": 0.021363306790590286, + "step": 4 + }, + { + "clip_ratio/high_max": 9.68160156844533e-06, + "clip_ratio/high_mean": 2.4204003921113326e-06, + "clip_ratio/low_mean": 4.577123684157414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8191637006311794e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15767.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 5696.4140625, + 
"completions/mean_terminated_length": 5696.4140625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.1476548686623573, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025258746463805437, + "learning_rate": 1e-05, + "loss": -0.0344, + "num_tokens": 3841078.0, + "reward": 0.3046875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910295009613, + "sampling/importance_sampling_ratio/min": 1.4871986877551535e-06, + "sampling/sampling_logp_difference/max": 13.41861629486084, + "sampling/sampling_logp_difference/mean": 0.020693503320217133, + "step": 5 + }, + { + "clip_ratio/high_max": 3.660332322397153e-05, + "clip_ratio/high_mean": 1.029715701861278e-05, + "clip_ratio/low_mean": 3.895585894042597e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.925301630009926e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14489.0, + "completions/mean_length": 5280.890625, + "completions/mean_terminated_length": 5104.65087890625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.8976912423968315, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0050104837864637375, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 4535640.0, + "reward": 0.359375, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.0007187551236711442, + "sampling/sampling_logp_difference/max": 7.237989902496338, + 
"sampling/sampling_logp_difference/mean": 0.018597707152366638, + "step": 6 + }, + { + "clip_ratio/high_max": 4.484465989662567e-06, + "clip_ratio/high_mean": 1.1211164974156418e-06, + "clip_ratio/low_mean": 2.823482634539687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9355942729125672e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16166.0, + "completions/mean_length": 6874.9453125, + "completions/mean_terminated_length": 6568.20166015625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 1.0286128222942352, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018693821039050817, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 5434801.0, + "reward": 0.203125, + "reward_std": 0.21778544783592224, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999491572380066, + "sampling/importance_sampling_ratio/min": 5.279039783090411e-07, + "sampling/sampling_logp_difference/max": 14.454351425170898, + "sampling/sampling_logp_difference/mean": 0.020383886992931366, + "step": 7 + }, + { + "clip_ratio/high_max": 2.5703585606606794e-05, + "clip_ratio/high_mean": 7.537758676789963e-06, + "clip_ratio/low_mean": 5.802649661745818e-05, + "clip_ratio/low_min": 6.0229353948670905e-06, + "clip_ratio/region_mean": 6.556425523740472e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15910.0, + "completions/mean_length": 5270.5234375, + "completions/mean_terminated_length": 5094.119140625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 1.0461085885763168, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005809026304632425, 
+ "learning_rate": 1e-05, + "loss": 0.0602, + "num_tokens": 6128708.0, + "reward": 0.3359375, + "reward_std": 0.37320882081985474, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 8.339863597939257e-06, + "sampling/sampling_logp_difference/max": 11.694463729858398, + "sampling/sampling_logp_difference/mean": 0.02038305625319481, + "step": 8 + }, + { + "clip_ratio/high_max": 3.965832502217381e-05, + "clip_ratio/high_mean": 1.2004183304270555e-05, + "clip_ratio/low_mean": 2.037043998370791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.237462271954428e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 4524.6796875, + "completions/mean_terminated_length": 4431.29931640625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.8275458142161369, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002717240946367383, + "learning_rate": 1e-05, + "loss": 0.1005, + "num_tokens": 6726587.0, + "reward": 0.4453125, + "reward_std": 0.32325831055641174, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.0002034705103142187, + "sampling/sampling_logp_difference/max": 8.49998950958252, + "sampling/sampling_logp_difference/mean": 0.017633724957704544, + "step": 9 + }, + { + "clip_ratio/high_max": 7.08802053850377e-06, + "clip_ratio/high_mean": 1.7720051346259424e-06, + "clip_ratio/low_mean": 3.394487077912345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.571687602743623e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15243.0, + "completions/mean_length": 5129.171875, + "completions/mean_terminated_length": 4950.52392578125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.7103187441825867, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005626584868878126, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 7400273.0, + "reward": 0.6796875, + "reward_std": 0.379814088344574, + "rewards/accuracy_reward/mean": 0.6796875, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.002478840760886669, + "sampling/sampling_logp_difference/max": 5.999964237213135, + "sampling/sampling_logp_difference/mean": 0.016138140112161636, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.9378599517949624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9378599517949624e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15482.0, + "completions/max_terminated_length": 15482.0, + "completions/mean_length": 4741.296875, + "completions/mean_terminated_length": 4741.296875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.903806746006012, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0022279289551079273, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 8026991.0, + "reward": 0.3203125, + "reward_std": 0.1701665222644806, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + 
"sampling/importance_sampling_ratio/min": 0.00012343087291810662, + "sampling/sampling_logp_difference/max": 8.999829292297363, + "sampling/sampling_logp_difference/mean": 0.01844138652086258, + "step": 11 + }, + { + "clip_ratio/high_max": 4.8331594371120445e-06, + "clip_ratio/high_mean": 1.93793562175415e-06, + "clip_ratio/low_mean": 4.368338659332949e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.562132153296261e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 5855.5546875, + "completions/mean_terminated_length": 5602.8720703125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 1.047883652150631, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038395742885768414, + "learning_rate": 1e-05, + "loss": 0.1298, + "num_tokens": 8797134.0, + "reward": 0.421875, + "reward_std": 0.3503503203392029, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999397397041321, + "sampling/importance_sampling_ratio/min": 0.0007607790757901967, + "sampling/sampling_logp_difference/max": 7.1811676025390625, + "sampling/sampling_logp_difference/mean": 0.02074582129716873, + "step": 12 + }, + { + "clip_ratio/high_max": 3.0723854251846205e-06, + "clip_ratio/high_mean": 7.680963562961551e-07, + "clip_ratio/low_mean": 1.2482652891776524e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.325074924807268e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6816.6953125, + "completions/mean_terminated_length": 6664.83349609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 
1.1763990670442581, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0011414350010454655, + "learning_rate": 1e-05, + "loss": 0.0197, + "num_tokens": 9691639.0, + "reward": 0.25, + "reward_std": 0.1354655921459198, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998582601547241, + "sampling/importance_sampling_ratio/min": 1.8868423268258994e-08, + "sampling/sampling_logp_difference/max": 17.785776138305664, + "sampling/sampling_logp_difference/mean": 0.021673155948519707, + "step": 13 + }, + { + "clip_ratio/high_max": 1.3825085034113727e-05, + "clip_ratio/high_mean": 3.4562712585284316e-06, + "clip_ratio/low_mean": 2.299899915669812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6455270244696294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + "completions/max_terminated_length": 15459.0, + "completions/mean_length": 5313.53125, + "completions/mean_terminated_length": 5313.53125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 1.0474217981100082, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004565369803458452, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 10391515.0, + "reward": 0.296875, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998889565467834, + "sampling/importance_sampling_ratio/min": 2.431661960144993e-05, + "sampling/sampling_logp_difference/max": 10.624350547790527, + "sampling/sampling_logp_difference/mean": 0.020862173289060593, + "step": 14 + }, + { + "clip_ratio/high_max": 1.1656098649837077e-05, + "clip_ratio/high_mean": 2.914024662459269e-06, + 
"clip_ratio/low_mean": 6.22073393969913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.512136405945057e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14724.0, + "completions/mean_length": 4732.578125, + "completions/mean_terminated_length": 4640.83447265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 1.0815455242991447, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006024828180670738, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 11017781.0, + "reward": 0.25, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.00306904804892838, + "sampling/sampling_logp_difference/max": 5.786387920379639, + "sampling/sampling_logp_difference/mean": 0.020809629932045937, + "step": 15 + }, + { + "clip_ratio/high_max": 5.413130111264763e-06, + "clip_ratio/high_mean": 1.3532825278161908e-06, + "clip_ratio/low_mean": 2.816210690070875e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.951538942852494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5931.4296875, + "completions/mean_terminated_length": 5849.1259765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 1.0069087892770767, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036383175756782293, + "learning_rate": 1e-05, + "loss": -0.0001, + "num_tokens": 11794972.0, + "reward": 0.1875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 
0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 0.00028886934160254896, + "sampling/sampling_logp_difference/max": 8.1495361328125, + "sampling/sampling_logp_difference/mean": 0.019794823601841927, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.421858264118782e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.421858264118782e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14139.0, + "completions/mean_length": 5473.6171875, + "completions/mean_terminated_length": 5387.70849609375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 1.0765233263373375, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004312732256948948, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 12517443.0, + "reward": 0.2578125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999938428401947, + "sampling/importance_sampling_ratio/min": 1.0231680391825648e-07, + "sampling/sampling_logp_difference/max": 16.095191955566406, + "sampling/sampling_logp_difference/mean": 0.020093362778425217, + "step": 17 + }, + { + "clip_ratio/high_max": 2.0872469121968606e-05, + "clip_ratio/high_mean": 5.218117280492152e-06, + "clip_ratio/low_mean": 4.733878370188904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.255690120975487e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6617.7578125, + "completions/mean_terminated_length": 6137.45068359375, + 
"completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8550976514816284, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021260723005980253, + "learning_rate": 1e-05, + "loss": 0.1382, + "num_tokens": 13384420.0, + "reward": 0.3828125, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.02929881028831005, + "sampling/sampling_logp_difference/max": 3.5302083492279053, + "sampling/sampling_logp_difference/mean": 0.01808803342282772, + "step": 18 + }, + { + "clip_ratio/high_max": 6.404673058568733e-06, + "clip_ratio/high_mean": 1.6011682646421832e-06, + "clip_ratio/low_mean": 3.2195434073400975e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.379660131486162e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5285.7578125, + "completions/mean_terminated_length": 5109.595703125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.8321448192000389, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003563448553904891, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 14081197.0, + "reward": 0.375, + "reward_std": 0.31116873025894165, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998770952224731, + "sampling/importance_sampling_ratio/min": 0.000519682711455971, + "sampling/sampling_logp_difference/max": 7.562292098999023, + "sampling/sampling_logp_difference/mean": 0.017500173300504684, + "step": 19 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.128390534991922e-05, + "clip_ratio/low_min": 1.2459845038392814e-05, + "clip_ratio/region_mean": 5.128390534991922e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13991.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 4918.1953125, + "completions/mean_terminated_length": 4918.1953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.9329824000597, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0048850164748728275, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 14727798.0, + "reward": 0.359375, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999402165412903, + "sampling/importance_sampling_ratio/min": 0.00041761461761780083, + "sampling/sampling_logp_difference/max": 7.780951499938965, + "sampling/sampling_logp_difference/mean": 0.01855182647705078, + "step": 20 + }, + { + "clip_ratio/high_max": 1.0364761692471802e-05, + "clip_ratio/high_mean": 2.5911904231179506e-06, + "clip_ratio/low_mean": 3.091395433330035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350514430167095e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16202.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 5268.5234375, + "completions/mean_terminated_length": 5268.5234375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.1676538437604904, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030562332831323147, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 15421937.0, + "reward": 0.28125, + "reward_std": 0.26249876618385315, + 
"rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.0016021198825910687, + "sampling/sampling_logp_difference/max": 6.436427593231201, + "sampling/sampling_logp_difference/mean": 0.021109789609909058, + "step": 21 + }, + { + "clip_ratio/high_max": 1.6653621514706174e-05, + "clip_ratio/high_mean": 4.1634053786765435e-06, + "clip_ratio/low_mean": 3.064284169340681e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.480624718577019e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5361.0703125, + "completions/mean_terminated_length": 5186.103515625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.9569757729768753, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003777366131544113, + "learning_rate": 1e-05, + "loss": 0.0058, + "num_tokens": 16128698.0, + "reward": 0.3359375, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.004354433622211218, + "sampling/sampling_logp_difference/max": 5.43656063079834, + "sampling/sampling_logp_difference/mean": 0.01940997503697872, + "step": 22 + }, + { + "clip_ratio/high_max": 3.227977140340954e-05, + "clip_ratio/high_mean": 9.227950727108691e-06, + "clip_ratio/low_mean": 4.881033578385541e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8038286169903586e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15623.0, + "completions/mean_length": 7428.3125, + "completions/mean_terminated_length": 6910.21435546875, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 1.0387683138251305, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005749945063143969, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 17101202.0, + "reward": 0.2734375, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.010671229101717472, + "sampling/sampling_logp_difference/max": 4.540204048156738, + "sampling/sampling_logp_difference/mean": 0.021208524703979492, + "step": 23 + }, + { + "clip_ratio/high_max": 2.544114977354184e-06, + "clip_ratio/high_mean": 6.36028744338546e-07, + "clip_ratio/low_mean": 4.543399086287536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6070018697719206e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15659.0, + "completions/mean_length": 5462.203125, + "completions/mean_terminated_length": 5288.841796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.088257022202015, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005364824552088976, + "learning_rate": 1e-05, + "loss": -0.0106, + "num_tokens": 17820796.0, + "reward": 0.1953125, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0013558369828388095, + "sampling/sampling_logp_difference/max": 
6.603336334228516, + "sampling/sampling_logp_difference/mean": 0.020104583352804184, + "step": 24 + }, + { + "clip_ratio/high_max": 1.132360557676293e-05, + "clip_ratio/high_mean": 2.8309013941907324e-06, + "clip_ratio/low_mean": 3.686837260374887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96992739979396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 5423.234375, + "completions/mean_terminated_length": 5249.25439453125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.9123491793870926, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002392752794548869, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 18538546.0, + "reward": 0.3125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 1.657394705034676e-06, + "sampling/sampling_logp_difference/max": 13.310263633728027, + "sampling/sampling_logp_difference/mean": 0.02011517994105816, + "step": 25 + }, + { + "clip_ratio/high_max": 2.127026391463005e-05, + "clip_ratio/high_mean": 6.648429234701325e-06, + "clip_ratio/low_mean": 1.4927492088645522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.157592166440736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13978.0, + "completions/mean_length": 5574.1640625, + "completions/mean_terminated_length": 5489.04736328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 1.0090710371732712, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036540210712701082, 
+ "learning_rate": 1e-05, + "loss": 0.0036, + "num_tokens": 19270439.0, + "reward": 0.3515625, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 0.00021918962011113763, + "sampling/sampling_logp_difference/max": 8.425573348999023, + "sampling/sampling_logp_difference/mean": 0.02006707340478897, + "step": 26 + }, + { + "clip_ratio/high_max": 1.1303152405162109e-05, + "clip_ratio/high_mean": 2.8257881012905273e-06, + "clip_ratio/low_mean": 2.827990363130084e-05, + "clip_ratio/low_min": 5.86744272368378e-06, + "clip_ratio/region_mean": 3.1105691391530854e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15061.0, + "completions/mean_length": 6204.75, + "completions/mean_terminated_length": 6124.5986328125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.957111045718193, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006005869247019291, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 20083655.0, + "reward": 0.3046875, + "reward_std": 0.31616854667663574, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549388885498, + "sampling/importance_sampling_ratio/min": 1.1864853988186042e-08, + "sampling/sampling_logp_difference/max": 18.249685287475586, + "sampling/sampling_logp_difference/mean": 0.0189923457801342, + "step": 27 + }, + { + "clip_ratio/high_max": 8.289213610623847e-06, + "clip_ratio/high_mean": 2.0723034026559617e-06, + "clip_ratio/low_mean": 3.4569659419503296e-05, + "clip_ratio/low_min": 3.6480373637459707e-06, 
+ "clip_ratio/region_mean": 3.664196310637635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15274.0, + "completions/mean_length": 5659.5703125, + "completions/mean_terminated_length": 5489.341796875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.9482033550739288, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004129618871957064, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 20829064.0, + "reward": 0.3515625, + "reward_std": 0.3745690584182739, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 4.007936149719171e-05, + "sampling/sampling_logp_difference/max": 10.124649047851562, + "sampling/sampling_logp_difference/mean": 0.019232336431741714, + "step": 28 + }, + { + "clip_ratio/high_max": 1.3534072877519066e-05, + "clip_ratio/high_mean": 3.3835182193797664e-06, + "clip_ratio/low_mean": 2.9090757720950933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.247427605401754e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14558.0, + "completions/mean_length": 5122.9609375, + "completions/mean_terminated_length": 5034.29150390625, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 1.020588956773281, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004814058542251587, + "learning_rate": 1e-05, + "loss": 0.0994, + "num_tokens": 21505483.0, + "reward": 0.3359375, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.0003798597026616335, + "sampling/sampling_logp_difference/max": 7.87570858001709, + "sampling/sampling_logp_difference/mean": 0.019156761467456818, + "step": 29 + }, + { + "clip_ratio/high_max": 2.0903964468743652e-05, + "clip_ratio/high_mean": 5.225991117185913e-06, + "clip_ratio/low_mean": 4.13707307416189e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.659672185880481e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15926.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 4833.734375, + "completions/mean_terminated_length": 4833.734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 1.0276868790388107, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006946730427443981, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 22142657.0, + "reward": 0.421875, + "reward_std": 0.250127375125885, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.000452048028819263, + "sampling/sampling_logp_difference/max": 7.701722145080566, + "sampling/sampling_logp_difference/mean": 0.019841451197862625, + "step": 30 + }, + { + "clip_ratio/high_max": 8.514986802765634e-06, + "clip_ratio/high_mean": 2.1287467006914085e-06, + "clip_ratio/low_mean": 3.9484380408794095e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.161312688211183e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6895.390625, + "completions/mean_terminated_length": 6589.30615234375, + 
"completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.1640124469995499, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0040768519975245, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 23045931.0, + "reward": 0.1484375, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603629112244, + "sampling/importance_sampling_ratio/min": 0.0006075318087823689, + "sampling/sampling_logp_difference/max": 7.406105995178223, + "sampling/sampling_logp_difference/mean": 0.02265278436243534, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.95245172057912e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95245172057912e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 6504.0625, + "completions/mean_terminated_length": 6347.23828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 1.1040372923016548, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004854958038777113, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 23899259.0, + "reward": 0.2265625, + "reward_std": 0.2619747221469879, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 3.380438373667971e-09, + "sampling/sampling_logp_difference/max": 19.505260467529297, + "sampling/sampling_logp_difference/mean": 0.020535167306661606, + "step": 32 + }, + { + "clip_ratio/high_max": 
1.8890462797571672e-05, + "clip_ratio/high_mean": 4.722615699392918e-06, + "clip_ratio/low_mean": 4.095688700544997e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.567950259115605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14787.0, + "completions/mean_length": 5553.2578125, + "completions/mean_terminated_length": 5467.9765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 1.0357396975159645, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005473555997014046, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 24631956.0, + "reward": 0.4296875, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000377893447876, + "sampling/importance_sampling_ratio/min": 0.004898479674011469, + "sampling/sampling_logp_difference/max": 5.318830490112305, + "sampling/sampling_logp_difference/mean": 0.019490022212266922, + "step": 33 + }, + { + "clip_ratio/high_max": 8.08538152341498e-06, + "clip_ratio/high_mean": 2.021345380853745e-06, + "clip_ratio/low_mean": 2.4400278334724135e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6421623601891042e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15753.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 5357.46875, + "completions/mean_terminated_length": 5357.46875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0332984924316406, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003410332603380084, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 25336544.0, + "reward": 0.4453125, + "reward_std": 0.32613158226013184, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.00010891074634855613, + "sampling/sampling_logp_difference/max": 9.124981880187988, + "sampling/sampling_logp_difference/mean": 0.01885366439819336, + "step": 34 + }, + { + "clip_ratio/high_max": 1.2948894436703995e-05, + "clip_ratio/high_mean": 3.2372236091759987e-06, + "clip_ratio/low_mean": 3.931040214411041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.25476254122259e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7812.8984375, + "completions/mean_terminated_length": 7745.4091796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.031004011631012, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003122704103589058, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 26355691.0, + "reward": 0.2890625, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.002222655341029167, + "sampling/sampling_logp_difference/max": 6.109052658081055, + "sampling/sampling_logp_difference/mean": 0.022181488573551178, + "step": 35 + }, + { + "clip_ratio/high_max": 1.3199577551858965e-05, + "clip_ratio/high_mean": 3.2998943879647413e-06, + "clip_ratio/low_mean": 3.742906312709238e-05, + "clip_ratio/low_min": 3.3127500955743017e-06, + "clip_ratio/region_mean": 4.072895751505712e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16160.0, + "completions/mean_length": 6402.6875, + "completions/mean_terminated_length": 5825.255859375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.96993837505579, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003292364301159978, + "learning_rate": 1e-05, + "loss": 0.0211, + "num_tokens": 27193267.0, + "reward": 0.375, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000267028808594, + "sampling/importance_sampling_ratio/min": 4.0287636693392415e-07, + "sampling/sampling_logp_difference/max": 14.72463607788086, + "sampling/sampling_logp_difference/mean": 0.019621271640062332, + "step": 36 + }, + { + "clip_ratio/high_max": 9.08137690203148e-06, + "clip_ratio/high_mean": 2.27034422550787e-06, + "clip_ratio/low_mean": 4.5394222524919314e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 4.766456731886137e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + "completions/mean_length": 7525.40625, + "completions/mean_terminated_length": 7165.30078125, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9819100275635719, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004192501772195101, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 28181183.0, + "reward": 0.3125, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439716339111, + "sampling/importance_sampling_ratio/min": 1.3309776477399282e-05, + 
"sampling/sampling_logp_difference/max": 11.227011680603027, + "sampling/sampling_logp_difference/mean": 0.019877666607499123, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.638440969349176e-05, + "clip_ratio/low_min": 6.698462129861582e-06, + "clip_ratio/region_mean": 2.638440969349176e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6930.8828125, + "completions/mean_terminated_length": 6625.943359375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9183463454246521, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029556062072515488, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 29087384.0, + "reward": 0.3359375, + "reward_std": 0.2740417718887329, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 4.6953626764434375e-08, + "sampling/sampling_logp_difference/max": 16.87410545349121, + "sampling/sampling_logp_difference/mean": 0.0197360310703516, + "step": 38 + }, + { + "clip_ratio/high_max": 2.4210238279920304e-05, + "clip_ratio/high_mean": 6.052559569980076e-06, + "clip_ratio/low_mean": 3.344960384765727e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9502163645011024e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 5893.1796875, + "completions/mean_terminated_length": 5726.6591796875, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 1.05657509714365, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.0044409241527318954, + "learning_rate": 1e-05, + "loss": 0.0042, + "num_tokens": 29860767.0, + "reward": 0.2890625, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999690055847168, + "sampling/importance_sampling_ratio/min": 2.243226049358782e-07, + "sampling/sampling_logp_difference/max": 15.3101806640625, + "sampling/sampling_logp_difference/mean": 0.02058839052915573, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.4493159887460934e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4493159887460934e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13471.0, + "completions/max_terminated_length": 13471.0, + "completions/mean_length": 5779.4765625, + "completions/mean_terminated_length": 5779.4765625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.0302623957395554, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004552105907350779, + "learning_rate": 1e-05, + "loss": -0.0198, + "num_tokens": 30620388.0, + "reward": 0.3203125, + "reward_std": 0.3295513987541199, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.011830558069050312, + "sampling/sampling_logp_difference/max": 4.437069416046143, + "sampling/sampling_logp_difference/mean": 0.020457806065678596, + "step": 40 + }, + { + "clip_ratio/high_max": 9.270246664527804e-06, + "clip_ratio/high_mean": 2.317561666131951e-06, + "clip_ratio/low_mean": 3.615360617459373e-05, + "clip_ratio/low_min": 4.283315774955554e-06, + 
"clip_ratio/region_mean": 3.8471167840725684e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13927.0, + "completions/max_terminated_length": 13927.0, + "completions/mean_length": 5429.1328125, + "completions/mean_terminated_length": 5429.1328125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9245247691869736, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003300054930150509, + "learning_rate": 1e-05, + "loss": 0.1138, + "num_tokens": 31334221.0, + "reward": 0.4765625, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.00017977353127207607, + "sampling/sampling_logp_difference/max": 8.623812675476074, + "sampling/sampling_logp_difference/mean": 0.01882476732134819, + "step": 41 + }, + { + "clip_ratio/high_max": 1.5042513723528828e-05, + "clip_ratio/high_mean": 3.760628430882207e-06, + "clip_ratio/low_mean": 3.780993347390904e-05, + "clip_ratio/low_min": 3.7437480386870448e-06, + "clip_ratio/region_mean": 4.157056224585176e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14875.0, + "completions/mean_length": 6073.6328125, + "completions/mean_terminated_length": 5909.9765625, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 1.0127769336104393, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004679495934396982, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 32134854.0, + "reward": 0.359375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 0.999961793422699, + "sampling/importance_sampling_ratio/min": 0.0006151580018922687, + "sampling/sampling_logp_difference/max": 7.393631458282471, + "sampling/sampling_logp_difference/mean": 0.02106339856982231, + "step": 42 + }, + { + "clip_ratio/high_max": 1.8307343452761415e-05, + "clip_ratio/high_mean": 4.576835863190354e-06, + "clip_ratio/low_mean": 5.7316304378218774e-05, + "clip_ratio/low_min": 1.412125402566744e-05, + "clip_ratio/region_mean": 6.189314035509597e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15052.0, + "completions/max_terminated_length": 15052.0, + "completions/mean_length": 5773.015625, + "completions/mean_terminated_length": 5773.015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0045431107282639, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00485749589279294, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 32897040.0, + "reward": 0.46875, + "reward_std": 0.3595343232154846, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000348091125488, + "sampling/importance_sampling_ratio/min": 4.862526111537591e-06, + "sampling/sampling_logp_difference/max": 12.233952522277832, + "sampling/sampling_logp_difference/mean": 0.01966444030404091, + "step": 43 + }, + { + "clip_ratio/high_max": 1.578610726937768e-05, + "clip_ratio/high_mean": 3.94652681734442e-06, + "clip_ratio/low_mean": 1.772546147549292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1671988179150503e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14572.0, + "completions/mean_length": 4731.3515625, + "completions/mean_terminated_length": 4639.5986328125, + "completions/min_length": 369.0, 
+ "completions/min_terminated_length": 369.0, + "entropy": 1.0001292675733566, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004726089537143707, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 33522133.0, + "reward": 0.390625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999152421951294, + "sampling/importance_sampling_ratio/min": 0.0001548011932754889, + "sampling/sampling_logp_difference/max": 8.773368835449219, + "sampling/sampling_logp_difference/mean": 0.019276604056358337, + "step": 44 + }, + { + "clip_ratio/high_max": 7.944579010654707e-06, + "clip_ratio/high_mean": 1.9861447526636766e-06, + "clip_ratio/low_mean": 8.259907644969644e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.024605239763332e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15661.0, + "completions/mean_length": 6908.8984375, + "completions/mean_terminated_length": 6834.29150390625, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.0723063945770264, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0011808272683992982, + "learning_rate": 1e-05, + "loss": 0.0202, + "num_tokens": 34429384.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999626278877258, + "sampling/importance_sampling_ratio/min": 0.0007662919815629721, + "sampling/sampling_logp_difference/max": 7.173947334289551, + "sampling/sampling_logp_difference/mean": 0.021076666191220284, + "step": 45 + }, + { + "clip_ratio/high_max": 
8.888357569958316e-06, + "clip_ratio/high_mean": 2.222089392489579e-06, + "clip_ratio/low_mean": 2.6357692036071967e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8579780860127357e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6679.140625, + "completions/mean_terminated_length": 6446.22412109375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.9413202852010727, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003600373398512602, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 35302474.0, + "reward": 0.3203125, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 9.02900064829737e-05, + "sampling/sampling_logp_difference/max": 9.312483787536621, + "sampling/sampling_logp_difference/mean": 0.019808633252978325, + "step": 46 + }, + { + "clip_ratio/high_max": 9.364646757603623e-06, + "clip_ratio/high_mean": 2.3411616894009057e-06, + "clip_ratio/low_mean": 1.6833528775350715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9174690351064783e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16253.0, + "completions/mean_length": 5954.5859375, + "completions/mean_terminated_length": 5872.46435546875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 1.200403742492199, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003302425378933549, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 36093941.0, + "reward": 0.1640625, + "reward_std": 
0.1990984082221985, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998957514762878, + "sampling/importance_sampling_ratio/min": 0.0026806045789271593, + "sampling/sampling_logp_difference/max": 5.921712875366211, + "sampling/sampling_logp_difference/mean": 0.022528307512402534, + "step": 47 + }, + { + "clip_ratio/high_max": 1.2503618108894443e-05, + "clip_ratio/high_mean": 3.944288664570195e-06, + "clip_ratio/low_mean": 4.7836430894676596e-05, + "clip_ratio/low_min": 6.161485543998424e-06, + "clip_ratio/region_mean": 5.1780719331873115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 6109.1953125, + "completions/mean_terminated_length": 5946.103515625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9069097489118576, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005992463324218988, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 36893486.0, + "reward": 0.4921875, + "reward_std": 0.40373340249061584, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 2.4301432858919725e-05, + "sampling/sampling_logp_difference/max": 10.624975204467773, + "sampling/sampling_logp_difference/mean": 0.018979201093316078, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1075947440986056e-05, + "clip_ratio/high_mean": 2.768986860246514e-06, + "clip_ratio/low_mean": 2.73638818271138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.013286891473399e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6265.5390625, + "completions/mean_terminated_length": 6022.6962890625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9107594564557076, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005304713733494282, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 37716027.0, + "reward": 0.484375, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.0003461121814325452, + "sampling/sampling_logp_difference/max": 7.968747615814209, + "sampling/sampling_logp_difference/mean": 0.019227473065257072, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0917767667706357e-05, + "clip_ratio/high_mean": 3.674950448839809e-06, + "clip_ratio/low_mean": 3.135283236588293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.50277827010359e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15860.0, + "completions/mean_length": 6143.1796875, + "completions/mean_terminated_length": 5897.400390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.9168931543827057, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017410843865945935, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 38519738.0, + "reward": 0.3984375, + "reward_std": 0.2301519215106964, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998773336410522, + "sampling/importance_sampling_ratio/min": 
0.0036513316445052624, + "sampling/sampling_logp_difference/max": 5.612663269042969, + "sampling/sampling_logp_difference/mean": 0.019512062892317772, + "step": 50 + }, + { + "clip_ratio/high_max": 5.4981305765977595e-06, + "clip_ratio/high_mean": 3.7445629459398333e-06, + "clip_ratio/low_mean": 2.6178069106208568e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.99226320521484e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15616.0, + "completions/mean_length": 7165.265625, + "completions/mean_terminated_length": 7092.67724609375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.9690218195319176, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004612576216459274, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 39461012.0, + "reward": 0.3125, + "reward_std": 0.35505855083465576, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 4.5421067625284195e-05, + "sampling/sampling_logp_difference/max": 9.999534606933594, + "sampling/sampling_logp_difference/mean": 0.0201116893440485, + "step": 51 + }, + { + "clip_ratio/high_max": 1.2953334362464375e-05, + "clip_ratio/high_mean": 3.2383335906160937e-06, + "clip_ratio/low_mean": 2.1866131419301382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5104465066760895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16007.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 5617.9296875, + "completions/mean_terminated_length": 5617.9296875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 1.0479632839560509, + "epoch": 0.04783808647654094, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003172830445691943, + "learning_rate": 1e-05, + "loss": -0.0235, + "num_tokens": 40202979.0, + "reward": 0.3828125, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 1.229221084031451e-06, + "sampling/sampling_logp_difference/max": 13.609129905700684, + "sampling/sampling_logp_difference/mean": 0.020904643461108208, + "step": 52 + }, + { + "clip_ratio/high_max": 1.4129082956060302e-05, + "clip_ratio/high_mean": 4.841006557398941e-06, + "clip_ratio/low_mean": 4.556761541607557e-05, + "clip_ratio/low_min": 8.631802302261349e-06, + "clip_ratio/region_mean": 5.040862197347451e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 5995.3203125, + "completions/mean_terminated_length": 5913.51953125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 1.022934041917324, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003333345288410783, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 40989532.0, + "reward": 0.3046875, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999282360076904, + "sampling/importance_sampling_ratio/min": 8.228168007917702e-05, + "sampling/sampling_logp_difference/max": 9.405362129211426, + "sampling/sampling_logp_difference/mean": 0.021745413541793823, + "step": 53 + }, + { + "clip_ratio/high_max": 2.286436574649997e-05, + "clip_ratio/high_mean": 6.531613848892448e-06, + "clip_ratio/low_mean": 
3.960530659696815e-05, + "clip_ratio/low_min": 3.4269107800355414e-06, + "clip_ratio/region_mean": 4.6136920445860596e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15648.0, + "completions/mean_length": 6297.859375, + "completions/mean_terminated_length": 6055.79248046875, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9511058703064919, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005538261961191893, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 41813914.0, + "reward": 0.3671875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 0.00017344337538816035, + "sampling/sampling_logp_difference/max": 8.659659385681152, + "sampling/sampling_logp_difference/mean": 0.019708994776010513, + "step": 54 + }, + { + "clip_ratio/high_max": 4.575737420964288e-06, + "clip_ratio/high_mean": 1.143934355241072e-06, + "clip_ratio/low_mean": 2.561447990956367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6758414151117904e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14051.0, + "completions/max_terminated_length": 14051.0, + "completions/mean_length": 4765.046875, + "completions/mean_terminated_length": 4765.046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9130316227674484, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024318129289895296, + "learning_rate": 1e-05, + "loss": -0.0177, + "num_tokens": 42443288.0, + "reward": 0.4453125, + "reward_std": 0.29196253418922424, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 0.0001035423920257017, + "sampling/sampling_logp_difference/max": 9.175529479980469, + "sampling/sampling_logp_difference/mean": 0.01920286938548088, + "step": 55 + }, + { + "clip_ratio/high_max": 7.084159733494744e-06, + "clip_ratio/high_mean": 1.771039933373686e-06, + "clip_ratio/low_mean": 4.221943618176738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3990476115141064e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15445.0, + "completions/mean_length": 6411.5, + "completions/mean_terminated_length": 5834.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.8110766112804413, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018957280553877354, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 43287600.0, + "reward": 0.3984375, + "reward_std": 0.1990983933210373, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.0021892013028264046, + "sampling/sampling_logp_difference/max": 6.124218463897705, + "sampling/sampling_logp_difference/mean": 0.018554572016000748, + "step": 56 + }, + { + "clip_ratio/high_max": 6.7589489844976924e-06, + "clip_ratio/high_mean": 1.6897372461244231e-06, + "clip_ratio/low_mean": 4.334260950145108e-05, + "clip_ratio/low_min": 8.570448699174449e-06, + "clip_ratio/region_mean": 4.503234697494918e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15508.0, + 
"completions/mean_length": 6552.40625, + "completions/mean_terminated_length": 6235.2578125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 1.0034996420145035, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002431448083370924, + "learning_rate": 1e-05, + "loss": 0.0274, + "num_tokens": 44145524.0, + "reward": 0.25, + "reward_std": 0.3114011883735657, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999876081943512, + "sampling/importance_sampling_ratio/min": 0.051090992987155914, + "sampling/sampling_logp_difference/max": 2.974147081375122, + "sampling/sampling_logp_difference/mean": 0.020403606817126274, + "step": 57 + }, + { + "clip_ratio/high_max": 2.1032463337178342e-05, + "clip_ratio/high_mean": 6.422987098630983e-06, + "clip_ratio/low_mean": 1.0045687076853937e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.646867417548492e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13257.0, + "completions/mean_length": 4688.7734375, + "completions/mean_terminated_length": 4408.08837890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.9620971381664276, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004493447951972485, + "learning_rate": 1e-05, + "loss": 0.0344, + "num_tokens": 44763895.0, + "reward": 0.53125, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 3.1526888051303104e-05, + "sampling/sampling_logp_difference/max": 10.364669799804688, 
+ "sampling/sampling_logp_difference/mean": 0.01916680857539177, + "step": 58 + }, + { + "clip_ratio/high_max": 3.076594612139161e-05, + "clip_ratio/high_mean": 7.691486530347902e-06, + "clip_ratio/low_mean": 2.8500278403953416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.619176493430132e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 5381.1875, + "completions/mean_terminated_length": 5294.55126953125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 1.0265433564782143, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0049595762975513935, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 45470335.0, + "reward": 0.4296875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998573660850525, + "sampling/importance_sampling_ratio/min": 1.6373864752949885e-07, + "sampling/sampling_logp_difference/max": 15.624994277954102, + "sampling/sampling_logp_difference/mean": 0.020656142383813858, + "step": 59 + }, + { + "clip_ratio/high_max": 2.6326441002311185e-05, + "clip_ratio/high_mean": 6.581610250577796e-06, + "clip_ratio/low_mean": 3.143254116366734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8014151868992485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15006.0, + "completions/mean_length": 5613.84375, + "completions/mean_terminated_length": 5529.03955078125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.0289503335952759, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00655899103730917, + 
"learning_rate": 1e-05, + "loss": 0.068, + "num_tokens": 46206971.0, + "reward": 0.3671875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 0.03283476456999779, + "sampling/sampling_logp_difference/max": 3.4162673950195312, + "sampling/sampling_logp_difference/mean": 0.020495962351560593, + "step": 60 + }, + { + "clip_ratio/high_max": 3.233557390558417e-05, + "clip_ratio/high_mean": 8.083893476396042e-06, + "clip_ratio/low_mean": 3.3687326776998816e-05, + "clip_ratio/low_min": 5.745277576352237e-06, + "clip_ratio/region_mean": 4.1771219912334345e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14763.0, + "completions/mean_length": 5577.2890625, + "completions/mean_terminated_length": 5492.19677734375, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.9836367890238762, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007459669373929501, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 46940112.0, + "reward": 0.4453125, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000065565109253, + "sampling/importance_sampling_ratio/min": 8.196697649509588e-07, + "sampling/sampling_logp_difference/max": 14.014364242553711, + "sampling/sampling_logp_difference/mean": 0.018994126468896866, + "step": 61 + }, + { + "clip_ratio/high_max": 1.720242698866059e-05, + "clip_ratio/high_mean": 4.300606747165148e-06, + "clip_ratio/low_mean": 3.032099141364597e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.462159838818479e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16273.0, + "completions/mean_length": 6547.140625, + "completions/mean_terminated_length": 6311.05615234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9028418883681297, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005132914055138826, + "learning_rate": 1e-05, + "loss": -0.0013, + "num_tokens": 47796514.0, + "reward": 0.46875, + "reward_std": 0.2751026153564453, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0005014563794247806, + "sampling/sampling_logp_difference/max": 7.597993850708008, + "sampling/sampling_logp_difference/mean": 0.02021491341292858, + "step": 62 + }, + { + "clip_ratio/high_max": 1.078213176697318e-05, + "clip_ratio/high_mean": 2.695532941743295e-06, + "clip_ratio/low_mean": 2.838153790207798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1077070843821275e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7409.3125, + "completions/mean_terminated_length": 6811.00048828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8614663332700729, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034147046972066164, + "learning_rate": 1e-05, + "loss": 0.0196, + "num_tokens": 48765386.0, + "reward": 0.3125, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 4.202586751489434e-06, + "sampling/sampling_logp_difference/max": 12.379810333251953, + "sampling/sampling_logp_difference/mean": 0.01943383738398552, + "step": 63 + }, + { + "clip_ratio/high_max": 1.153353150584735e-05, + "clip_ratio/high_mean": 2.8833828764618374e-06, + "clip_ratio/low_mean": 3.695166174111364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.98350443902018e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14591.0, + "completions/mean_length": 6420.859375, + "completions/mean_terminated_length": 6181.744140625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.9671022593975067, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004101228900253773, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 49606280.0, + "reward": 0.34375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.000259009946603328, + "sampling/sampling_logp_difference/max": 8.258644104003906, + "sampling/sampling_logp_difference/mean": 0.01929381489753723, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.231768923546042e-05, + "clip_ratio/low_min": 5.164009053260088e-06, + "clip_ratio/region_mean": 4.231768923546042e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14971.0, + "completions/mean_length": 4852.7578125, + "completions/mean_terminated_length": 4761.96044921875, + "completions/min_length": 92.0, + 
"completions/min_terminated_length": 92.0, + "entropy": 0.9933939427137375, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0077895247377455235, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 50246457.0, + "reward": 0.40625, + "reward_std": 0.35400262475013733, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 2.4301109078805894e-05, + "sampling/sampling_logp_difference/max": 10.624988555908203, + "sampling/sampling_logp_difference/mean": 0.01895500347018242, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.009997408298659e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009997408298659e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15610.0, + "completions/max_terminated_length": 15610.0, + "completions/mean_length": 6840.03125, + "completions/mean_terminated_length": 6840.03125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.9959733113646507, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00207411777228117, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 51141597.0, + "reward": 0.28125, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 2.0275774659239687e-07, + "sampling/sampling_logp_difference/max": 15.411253929138184, + "sampling/sampling_logp_difference/mean": 0.02091015875339508, + "step": 66 + }, + { + "clip_ratio/high_max": 2.297391938554938e-05, + "clip_ratio/high_mean": 
6.853683203189576e-06, + "clip_ratio/low_mean": 4.6152885829542356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3006569942226633e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15249.0, + "completions/mean_length": 6567.3828125, + "completions/mean_terminated_length": 6331.7841796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 1.0921807065606117, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006496666464954615, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 52001758.0, + "reward": 0.296875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.0026403397787362337, + "sampling/sampling_logp_difference/max": 5.936847686767578, + "sampling/sampling_logp_difference/mean": 0.021580250933766365, + "step": 67 + }, + { + "clip_ratio/high_max": 1.2290649465285242e-05, + "clip_ratio/high_mean": 3.0726623663213104e-06, + "clip_ratio/low_mean": 1.7558751551405294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0631413917726604e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16336.0, + "completions/mean_length": 6927.265625, + "completions/mean_terminated_length": 6542.84521484375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8170016556978226, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002272722776979208, + "learning_rate": 1e-05, + "loss": 0.021, + "num_tokens": 52907256.0, + "reward": 0.28125, + "reward_std": 0.22673700749874115, + "rewards/accuracy_reward/mean": 0.28125, + 
"rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 6.70690099013882e-08, + "sampling/sampling_logp_difference/max": 16.51754379272461, + "sampling/sampling_logp_difference/mean": 0.01844738982617855, + "step": 68 + }, + { + "clip_ratio/high_max": 1.016177520796191e-05, + "clip_ratio/high_mean": 4.526967131823767e-06, + "clip_ratio/low_mean": 5.522496246612718e-05, + "clip_ratio/low_min": 4.129910394112812e-06, + "clip_ratio/region_mean": 5.9751928688456246e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16228.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 5889.28125, + "completions/mean_terminated_length": 5889.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 1.0794919431209564, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005651532672345638, + "learning_rate": 1e-05, + "loss": 0.0382, + "num_tokens": 53682100.0, + "reward": 0.3046875, + "reward_std": 0.32613158226013184, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 4.226289718189946e-07, + "sampling/sampling_logp_difference/max": 14.67677116394043, + "sampling/sampling_logp_difference/mean": 0.020069826394319534, + "step": 69 + }, + { + "clip_ratio/high_max": 5.796966888738098e-06, + "clip_ratio/high_mean": 1.4492417221845244e-06, + "clip_ratio/low_mean": 4.575056436806335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.719980597656104e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 
5909.3125, + "completions/mean_terminated_length": 5394.16357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.8462172821164131, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002985857194289565, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 54456508.0, + "reward": 0.4296875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 0.000453252432635054, + "sampling/sampling_logp_difference/max": 7.699061393737793, + "sampling/sampling_logp_difference/mean": 0.01927822455763817, + "step": 70 + }, + { + "clip_ratio/high_max": 1.8927265045931563e-05, + "clip_ratio/high_mean": 5.821615673085034e-06, + "clip_ratio/low_mean": 3.1553636290482245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.737525207725412e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15579.0, + "completions/mean_length": 7465.3984375, + "completions/mean_terminated_length": 7177.701171875, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.8792542889714241, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036494233645498753, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 55429663.0, + "reward": 0.3671875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998987913131714, + "sampling/importance_sampling_ratio/min": 0.0017587440088391304, + "sampling/sampling_logp_difference/max": 6.343155384063721, + 
"sampling/sampling_logp_difference/mean": 0.01909823715686798, + "step": 71 + }, + { + "clip_ratio/high_max": 8.78609989740653e-06, + "clip_ratio/high_mean": 2.1965249743516324e-06, + "clip_ratio/low_mean": 3.611839565564878e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.831492040262674e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5674.9609375, + "completions/mean_terminated_length": 5590.6376953125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9117730036377907, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003951186314225197, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 56173314.0, + "reward": 0.4140625, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.003206930123269558, + "sampling/sampling_logp_difference/max": 5.742441177368164, + "sampling/sampling_logp_difference/mean": 0.01932360976934433, + "step": 72 + }, + { + "clip_ratio/high_max": 1.7587798083695816e-05, + "clip_ratio/high_mean": 5.872955512131739e-06, + "clip_ratio/low_mean": 4.657158876852918e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.244454393960041e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16325.0, + "completions/max_terminated_length": 16325.0, + "completions/mean_length": 4754.5390625, + "completions/mean_terminated_length": 4754.5390625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.8350499644875526, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.005329386796802282, + "learning_rate": 1e-05, + 
"loss": 0.0328, + "num_tokens": 56799911.0, + "reward": 0.515625, + "reward_std": 0.4111049771308899, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 8.575750689487904e-05, + "sampling/sampling_logp_difference/max": 9.36398696899414, + "sampling/sampling_logp_difference/mean": 0.01792578026652336, + "step": 73 + }, + { + "clip_ratio/high_max": 1.2575374057632871e-05, + "clip_ratio/high_mean": 3.1438435144082177e-06, + "clip_ratio/low_mean": 1.8536085917730816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1679929204765358e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16316.0, + "completions/mean_length": 5744.2734375, + "completions/mean_terminated_length": 5488.92041015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8065197095274925, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036763548851013184, + "learning_rate": 1e-05, + "loss": 0.082, + "num_tokens": 57553986.0, + "reward": 0.515625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.00011362064105924219, + "sampling/sampling_logp_difference/max": 9.082645416259766, + "sampling/sampling_logp_difference/mean": 0.018098725005984306, + "step": 74 + }, + { + "clip_ratio/high_max": 1.877081149359583e-05, + "clip_ratio/high_mean": 6.101248914092139e-06, + "clip_ratio/low_mean": 2.6290458890798618e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239170769120392e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16088.0, + "completions/mean_length": 6754.5234375, + "completions/mean_terminated_length": 6523.41650390625, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 1.013127624988556, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038320303428918123, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 58438333.0, + "reward": 0.2890625, + "reward_std": 0.2369818389415741, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999582767486572, + "sampling/importance_sampling_ratio/min": 2.284922175022075e-06, + "sampling/sampling_logp_difference/max": 12.989178657531738, + "sampling/sampling_logp_difference/mean": 0.02173798717558384, + "step": 75 + }, + { + "clip_ratio/high_max": 1.9026635982299922e-05, + "clip_ratio/high_mean": 6.682960474790889e-06, + "clip_ratio/low_mean": 3.252214798976638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.920510800980992e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12421.0, + "completions/mean_length": 6203.5390625, + "completions/mean_terminated_length": 6123.3779296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 1.0302691981196404, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004985450301319361, + "learning_rate": 1e-05, + "loss": 0.0483, + "num_tokens": 59249562.0, + "reward": 0.421875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999778270721436, + "sampling/importance_sampling_ratio/min": 0.004553908482193947, + "sampling/sampling_logp_difference/max": 5.3917694091796875, + "sampling/sampling_logp_difference/mean": 0.019999932497739792, + "step": 76 + }, + { + "clip_ratio/high_max": 1.3964342088002013e-05, + "clip_ratio/high_mean": 3.4910855220005033e-06, + "clip_ratio/low_mean": 3.63567767180939e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.984786212640756e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 5727.796875, + "completions/mean_terminated_length": 5643.8896484375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.9781062752008438, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0037541294004768133, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 60001208.0, + "reward": 0.3828125, + "reward_std": 0.20753079652786255, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.00022466933296527714, + "sampling/sampling_logp_difference/max": 8.400880813598633, + "sampling/sampling_logp_difference/mean": 0.020555900409817696, + "step": 77 + }, + { + "clip_ratio/high_max": 2.7796927497547586e-06, + "clip_ratio/high_mean": 6.949231874386896e-07, + "clip_ratio/low_mean": 3.516969627526123e-05, + "clip_ratio/low_min": 4.025116595585132e-06, + "clip_ratio/region_mean": 3.586461934901308e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 5923.8359375, + "completions/mean_terminated_length": 5409.4013671875, + "completions/min_length": 597.0, + 
"completions/min_terminated_length": 597.0, + "entropy": 0.9449758678674698, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.007178841158747673, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 60777899.0, + "reward": 0.3359375, + "reward_std": 0.2977364659309387, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.0004897661856375635, + "sampling/sampling_logp_difference/max": 7.621582508087158, + "sampling/sampling_logp_difference/mean": 0.019868161529302597, + "step": 78 + }, + { + "clip_ratio/high_max": 2.7642782697512303e-05, + "clip_ratio/high_mean": 9.016423746288638e-06, + "clip_ratio/low_mean": 4.3257180891487224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.227360486514954e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14267.0, + "completions/mean_length": 6171.640625, + "completions/mean_terminated_length": 5926.54443359375, + "completions/min_length": 721.0, + "completions/min_terminated_length": 721.0, + "entropy": 0.8597526922821999, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004776299465447664, + "learning_rate": 1e-05, + "loss": 0.0136, + "num_tokens": 61587141.0, + "reward": 0.46875, + "reward_std": 0.36113685369491577, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945878982544, + "sampling/importance_sampling_ratio/min": 4.0065449866233394e-05, + "sampling/sampling_logp_difference/max": 10.124996185302734, + "sampling/sampling_logp_difference/mean": 0.019484341144561768, + "step": 79 + }, + { + "clip_ratio/high_max": 
4.145968978264136e-05, + "clip_ratio/high_mean": 1.036492244566034e-05, + "clip_ratio/low_mean": 3.6077020070024446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644194200409402e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15310.0, + "completions/mean_length": 5501.59375, + "completions/mean_terminated_length": 5415.9052734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.9705724790692329, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007431659381836653, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 62308321.0, + "reward": 0.453125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 4.54318942502141e-05, + "sampling/sampling_logp_difference/max": 9.999296188354492, + "sampling/sampling_logp_difference/mean": 0.019636545330286026, + "step": 80 + }, + { + "clip_ratio/high_max": 4.327206170273712e-06, + "clip_ratio/high_mean": 1.081801542568428e-06, + "clip_ratio/low_mean": 6.429905033655814e-05, + "clip_ratio/low_min": 6.3626184783061035e-06, + "clip_ratio/region_mean": 6.538085153806605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5908.125, + "completions/mean_terminated_length": 5825.6376953125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8575867265462875, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005465450696647167, + "learning_rate": 1e-05, + "loss": 0.0797, + "num_tokens": 63084113.0, + "reward": 0.34375, + "reward_std": 
0.39400771260261536, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999576210975647, + "sampling/importance_sampling_ratio/min": 4.766937126987614e-05, + "sampling/sampling_logp_difference/max": 9.951221466064453, + "sampling/sampling_logp_difference/mean": 0.018073562532663345, + "step": 81 + }, + { + "clip_ratio/high_max": 6.7512828536564484e-06, + "clip_ratio/high_mean": 1.6878207134141121e-06, + "clip_ratio/low_mean": 3.040744320514932e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.209526391856343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15981.0, + "completions/max_terminated_length": 15981.0, + "completions/mean_length": 4906.734375, + "completions/mean_terminated_length": 4906.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.9647495672106743, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003077819012105465, + "learning_rate": 1e-05, + "loss": -0.0104, + "num_tokens": 63740015.0, + "reward": 0.4375, + "reward_std": 0.2251344621181488, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091791152954, + "sampling/importance_sampling_ratio/min": 5.1233790145488456e-05, + "sampling/sampling_logp_difference/max": 9.879111289978027, + "sampling/sampling_logp_difference/mean": 0.01949312724173069, + "step": 82 + }, + { + "clip_ratio/high_max": 7.262375220307149e-06, + "clip_ratio/high_mean": 1.8155938050767872e-06, + "clip_ratio/low_mean": 3.626802561029763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8083618960627064e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + 
"completions/max_terminated_length": 15716.0, + "completions/mean_length": 5402.78125, + "completions/mean_terminated_length": 5402.78125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.9809223562479019, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018245981773361564, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 64450515.0, + "reward": 0.265625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999257922172546, + "sampling/importance_sampling_ratio/min": 0.0009712215978652239, + "sampling/sampling_logp_difference/max": 6.93695592880249, + "sampling/sampling_logp_difference/mean": 0.019615523517131805, + "step": 83 + }, + { + "clip_ratio/high_max": 1.1774221320592915e-05, + "clip_ratio/high_mean": 2.9435553301482287e-06, + "clip_ratio/low_mean": 4.734331901090627e-05, + "clip_ratio/low_min": 1.1585900665522786e-05, + "clip_ratio/region_mean": 5.0286874625271594e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6198.703125, + "completions/mean_terminated_length": 5870.14501953125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.8571672514081001, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006053395569324493, + "learning_rate": 1e-05, + "loss": 0.0645, + "num_tokens": 65269285.0, + "reward": 0.421875, + "reward_std": 0.3464113473892212, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 
0.0010333011159673333, + "sampling/sampling_logp_difference/max": 6.874996662139893, + "sampling/sampling_logp_difference/mean": 0.01869945600628853, + "step": 84 + }, + { + "clip_ratio/high_max": 2.7282983865006827e-05, + "clip_ratio/high_mean": 7.78695198278001e-06, + "clip_ratio/low_mean": 3.2358174394175876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0145126376955886e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6016.09375, + "completions/mean_terminated_length": 5851.52392578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.9883866459131241, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030623299535363913, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 66058473.0, + "reward": 0.3203125, + "reward_std": 0.24883407354354858, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999305009841919, + "sampling/importance_sampling_ratio/min": 0.0016286972677335143, + "sampling/sampling_logp_difference/max": 6.4199748039245605, + "sampling/sampling_logp_difference/mean": 0.02085939608514309, + "step": 85 + }, + { + "clip_ratio/high_max": 2.9797377010254422e-06, + "clip_ratio/high_mean": 7.449344252563606e-07, + "clip_ratio/low_mean": 3.9277208315979806e-05, + "clip_ratio/low_min": 4.51475443696836e-06, + "clip_ratio/region_mean": 4.002214268439275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 7072.53125, + "completions/mean_terminated_length": 6924.73046875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.0157204791903496, + "epoch": 
0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038264680188149214, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 66984285.0, + "reward": 0.2890625, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.0020860559307038784, + "sampling/sampling_logp_difference/max": 6.17248010635376, + "sampling/sampling_logp_difference/mean": 0.021116644144058228, + "step": 86 + }, + { + "clip_ratio/high_max": 6.0717920860042796e-06, + "clip_ratio/high_mean": 1.5179480215010699e-06, + "clip_ratio/low_mean": 3.757404465432046e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.909199278950837e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6539.8203125, + "completions/mean_terminated_length": 6303.56005859375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0071343630552292, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0044838739559054375, + "learning_rate": 1e-05, + "loss": 0.0583, + "num_tokens": 67840310.0, + "reward": 0.390625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875426292419, + "sampling/importance_sampling_ratio/min": 0.001703627873212099, + "sampling/sampling_logp_difference/max": 6.374995231628418, + "sampling/sampling_logp_difference/mean": 0.020990263670682907, + "step": 87 + }, + { + "clip_ratio/high_max": 2.859010169231624e-05, + "clip_ratio/high_mean": 7.14752542307906e-06, + "clip_ratio/low_mean": 
3.50394579982094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.218698381919239e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16224.0, + "completions/mean_length": 7204.09375, + "completions/mean_terminated_length": 6907.9677734375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "entropy": 0.9221752807497978, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034659637603908777, + "learning_rate": 1e-05, + "loss": -0.0057, + "num_tokens": 68782042.0, + "reward": 0.4140625, + "reward_std": 0.27958327531814575, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0003347320598550141, + "sampling/sampling_logp_difference/max": 8.002180099487305, + "sampling/sampling_logp_difference/mean": 0.02053149789571762, + "step": 88 + }, + { + "clip_ratio/high_max": 1.7420219137420645e-05, + "clip_ratio/high_mean": 4.355054784355161e-06, + "clip_ratio/low_mean": 2.086669928758056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.522175350350153e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14764.0, + "completions/mean_length": 5662.1640625, + "completions/mean_terminated_length": 5577.740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.9678512960672379, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024458845146000385, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 69526295.0, + "reward": 0.4375, + "reward_std": 0.18543373048305511, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.0033961546141654253, + "sampling/sampling_logp_difference/max": 5.6851115226745605, + "sampling/sampling_logp_difference/mean": 0.018346723169088364, + "step": 89 + }, + { + "clip_ratio/high_max": 8.09375796961831e-06, + "clip_ratio/high_mean": 2.0234394924045773e-06, + "clip_ratio/low_mean": 1.8629728629093734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0653167894124635e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16219.0, + "completions/mean_length": 5590.71875, + "completions/mean_terminated_length": 5505.732421875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9286820441484451, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004733253736048937, + "learning_rate": 1e-05, + "loss": 0.0719, + "num_tokens": 70262771.0, + "reward": 0.4609375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999312162399292, + "sampling/importance_sampling_ratio/min": 1.233097464137245e-05, + "sampling/sampling_logp_difference/max": 11.303396224975586, + "sampling/sampling_logp_difference/mean": 0.019460031762719154, + "step": 90 + }, + { + "clip_ratio/high_max": 1.8371122678217944e-05, + "clip_ratio/high_mean": 4.592780669554486e-06, + "clip_ratio/low_mean": 2.489819087259093e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.949097142845858e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6182.484375, + "completions/mean_terminated_length": 
6102.1572265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 1.0872880518436432, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00468763243407011, + "learning_rate": 1e-05, + "loss": 0.0223, + "num_tokens": 71079953.0, + "reward": 0.2578125, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 9.611623681848869e-05, + "sampling/sampling_logp_difference/max": 9.24995231628418, + "sampling/sampling_logp_difference/mean": 0.02069907821714878, + "step": 91 + }, + { + "clip_ratio/high_max": 1.579416039021453e-05, + "clip_ratio/high_mean": 4.633066396309005e-06, + "clip_ratio/low_mean": 2.6412633246764017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1045699415699346e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 5909.2265625, + "completions/mean_terminated_length": 5826.748046875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9488153457641602, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034273737110197544, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 71856574.0, + "reward": 0.4140625, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998411536216736, + "sampling/importance_sampling_ratio/min": 0.00016871529805939645, + "sampling/sampling_logp_difference/max": 8.687297821044922, + "sampling/sampling_logp_difference/mean": 
0.019539739936590195, + "step": 92 + }, + { + "clip_ratio/high_max": 3.7449817682499997e-06, + "clip_ratio/high_mean": 9.362454420624999e-07, + "clip_ratio/low_mean": 4.2946558664880286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.388280387956911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15933.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6381.3125, + "completions/mean_terminated_length": 6381.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.9708949401974678, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003149663796648383, + "learning_rate": 1e-05, + "loss": 0.0314, + "num_tokens": 72696806.0, + "reward": 0.3828125, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.00033631984842941165, + "sampling/sampling_logp_difference/max": 7.997447967529297, + "sampling/sampling_logp_difference/mean": 0.021038895472884178, + "step": 93 + }, + { + "clip_ratio/high_max": 6.492157353932271e-06, + "clip_ratio/high_mean": 1.6230393384830677e-06, + "clip_ratio/low_mean": 4.956343445883249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.118647413837607e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16180.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5726.03125, + "completions/mean_terminated_length": 5726.03125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.9100239053368568, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029015145264565945, + "learning_rate": 1e-05, + "loss": 0.0355, + "num_tokens": 73449210.0, + "reward": 
0.3125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.0001686852192506194, + "sampling/sampling_logp_difference/max": 8.68747615814209, + "sampling/sampling_logp_difference/mean": 0.020026210695505142, + "step": 94 + }, + { + "clip_ratio/high_max": 1.199616144731408e-05, + "clip_ratio/high_mean": 2.99904036182852e-06, + "clip_ratio/low_mean": 1.4287397789303213e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7286438151131733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 5824.90625, + "completions/mean_terminated_length": 5824.90625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9244210943579674, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0045582144521176815, + "learning_rate": 1e-05, + "loss": 0.0387, + "num_tokens": 74212662.0, + "reward": 0.4375, + "reward_std": 0.24777324497699738, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0021414682269096375, + "sampling/sampling_logp_difference/max": 6.146263599395752, + "sampling/sampling_logp_difference/mean": 0.019039880484342575, + "step": 95 + }, + { + "clip_ratio/high_max": 1.010842470350326e-05, + "clip_ratio/high_mean": 2.527106175875815e-06, + "clip_ratio/low_mean": 4.0637585470904014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.316469153309299e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15600.0, + "completions/mean_length": 6432.7265625, + "completions/mean_terminated_length": 6274.77001953125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.8756264597177505, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040692174807190895, + "learning_rate": 1e-05, + "loss": 0.0776, + "num_tokens": 75054003.0, + "reward": 0.4609375, + "reward_std": 0.35506343841552734, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998587369918823, + "sampling/importance_sampling_ratio/min": 0.005546991713345051, + "sampling/sampling_logp_difference/max": 5.194499492645264, + "sampling/sampling_logp_difference/mean": 0.019711513072252274, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6582903135240485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6582903135240485e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14589.0, + "completions/mean_length": 5474.6796875, + "completions/mean_terminated_length": 5388.779296875, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9279408678412437, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035478502977639437, + "learning_rate": 1e-05, + "loss": 0.1137, + "num_tokens": 75773194.0, + "reward": 0.546875, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000132322311401, + "sampling/importance_sampling_ratio/min": 0.004276251420378685, + 
"sampling/sampling_logp_difference/max": 5.454678535461426, + "sampling/sampling_logp_difference/mean": 0.018789665773510933, + "step": 97 + }, + { + "clip_ratio/high_max": 8.227548732975265e-06, + "clip_ratio/high_mean": 2.0568871832438163e-06, + "clip_ratio/low_mean": 4.1461861655989196e-05, + "clip_ratio/low_min": 3.5008122267754516e-06, + "clip_ratio/region_mean": 4.351874804342515e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 6730.2734375, + "completions/mean_terminated_length": 6577.0400390625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0115349367260933, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004816337022930384, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 76654837.0, + "reward": 0.40625, + "reward_std": 0.35325103998184204, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000003576278687, + "sampling/importance_sampling_ratio/min": 1.4203919818100985e-06, + "sampling/sampling_logp_difference/max": 13.464577674865723, + "sampling/sampling_logp_difference/mean": 0.021000642329454422, + "step": 98 + }, + { + "clip_ratio/high_max": 2.0833075723203365e-05, + "clip_ratio/high_mean": 5.208268930800841e-06, + "clip_ratio/low_mean": 2.399133984454238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.919960945746425e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14350.0, + "completions/mean_length": 4804.5859375, + "completions/mean_terminated_length": 4620.7861328125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.8622925356030464, + "epoch": 0.09107635694572216, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.00695947976782918, + "learning_rate": 1e-05, + "loss": -0.0188, + "num_tokens": 77287704.0, + "reward": 0.5859375, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000001072883606, + "sampling/importance_sampling_ratio/min": 0.051502522081136703, + "sampling/sampling_logp_difference/max": 2.9661245346069336, + "sampling/sampling_logp_difference/mean": 0.019261913374066353, + "step": 99 + }, + { + "clip_ratio/high_max": 1.2886264812550507e-05, + "clip_ratio/high_mean": 3.221566203137627e-06, + "clip_ratio/low_mean": 3.53349669239833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8556532899747253e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15997.0, + "completions/mean_length": 5836.25, + "completions/mean_terminated_length": 5753.19677734375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.8808795213699341, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034830078948289156, + "learning_rate": 1e-05, + "loss": 0.1412, + "num_tokens": 78054048.0, + "reward": 0.484375, + "reward_std": 0.29036492109298706, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999365210533142, + "sampling/importance_sampling_ratio/min": 2.7803641842183424e-06, + "sampling/sampling_logp_difference/max": 12.792928695678711, + "sampling/sampling_logp_difference/mean": 0.01845550537109375, + "step": 100 + }, + { + "clip_ratio/high_max": 2.630969447636744e-05, + "clip_ratio/high_mean": 6.57742361909186e-06, + "clip_ratio/low_mean": 3.4728200375866436e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1305623994958296e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13964.0, + "completions/mean_length": 5407.5703125, + "completions/mean_terminated_length": 5233.341796875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.9438152015209198, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028610217850655317, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 78765225.0, + "reward": 0.390625, + "reward_std": 0.26037710905075073, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 5.874436828889884e-05, + "sampling/sampling_logp_difference/max": 9.742315292358398, + "sampling/sampling_logp_difference/mean": 0.018839653581380844, + "step": 101 + }, + { + "clip_ratio/high_max": 1.2485550996643724e-05, + "clip_ratio/high_mean": 3.917444360013178e-06, + "clip_ratio/low_mean": 3.569766681721376e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961511060879275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15780.0, + "completions/mean_length": 6591.765625, + "completions/mean_terminated_length": 6436.33349609375, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.9185260459780693, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004098972305655479, + "learning_rate": 1e-05, + "loss": 0.0626, + "num_tokens": 79628691.0, + "reward": 0.40625, + "reward_std": 0.26932865381240845, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + "sampling/importance_sampling_ratio/min": 0.001684795250184834, + "sampling/sampling_logp_difference/max": 6.386111259460449, + "sampling/sampling_logp_difference/mean": 0.02011241763830185, + "step": 102 + }, + { + "clip_ratio/high_max": 1.017276917991694e-05, + "clip_ratio/high_mean": 2.543192294979235e-06, + "clip_ratio/low_mean": 2.3897301389297354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.644049368427659e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16014.0, + "completions/mean_length": 6762.40625, + "completions/mean_terminated_length": 6371.2841796875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 1.0496173724532127, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003109709592536092, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 80513135.0, + "reward": 0.296875, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.0036795397754758596, + "sampling/sampling_logp_difference/max": 5.6049675941467285, + "sampling/sampling_logp_difference/mean": 0.021886618807911873, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0623295338518801e-05, + "clip_ratio/high_mean": 2.6558238346297003e-06, + "clip_ratio/low_mean": 5.0279177912671e-05, + "clip_ratio/low_min": 6.849113788121031e-06, + "clip_ratio/region_mean": 5.29350020315178e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15653.0, + "completions/mean_length": 8151.421875, + 
"completions/mean_terminated_length": 7528.79052734375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.8989155367016792, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050065224058926105, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 81579941.0, + "reward": 0.375, + "reward_std": 0.36691081523895264, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999760389328003, + "sampling/importance_sampling_ratio/min": 0.0007560441154055297, + "sampling/sampling_logp_difference/max": 7.187410831451416, + "sampling/sampling_logp_difference/mean": 0.02017449401319027, + "step": 104 + }, + { + "clip_ratio/high_max": 7.662745701964013e-06, + "clip_ratio/high_mean": 1.9156864254910033e-06, + "clip_ratio/low_mean": 4.2927287609018094e-05, + "clip_ratio/low_min": 4.201963292871369e-06, + "clip_ratio/region_mean": 4.484297357976175e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16094.0, + "completions/mean_length": 6871.7265625, + "completions/mean_terminated_length": 6643.43212890625, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "entropy": 1.006680078804493, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00352756236679852, + "learning_rate": 1e-05, + "loss": 0.0927, + "num_tokens": 82479474.0, + "reward": 0.3984375, + "reward_std": 0.33296146988868713, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000028133392334, + "sampling/importance_sampling_ratio/min": 0.023528963327407837, + "sampling/sampling_logp_difference/max": 3.749523162841797, + 
"sampling/sampling_logp_difference/mean": 0.021244853734970093, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6621729173493804e-05, + "clip_ratio/high_mean": 5.544197733797773e-06, + "clip_ratio/low_mean": 2.3860119426899473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9404316592263058e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 5705.6015625, + "completions/mean_terminated_length": 5705.6015625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9162084609270096, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002348776441067457, + "learning_rate": 1e-05, + "loss": -0.0169, + "num_tokens": 83229071.0, + "reward": 0.4453125, + "reward_std": 0.29249149560928345, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 1.176890145870857e-05, + "sampling/sampling_logp_difference/max": 11.35004997253418, + "sampling/sampling_logp_difference/mean": 0.01885361783206463, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.359476631383586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.359476631383586e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6823.90625, + "completions/mean_terminated_length": 6823.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 1.0139815732836723, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005057404283434153, + "learning_rate": 1e-05, + "loss": 0.038, + "num_tokens": 
84119947.0, + "reward": 0.328125, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.014701711013913155, + "sampling/sampling_logp_difference/max": 4.219791412353516, + "sampling/sampling_logp_difference/mean": 0.021600374951958656, + "step": 107 + }, + { + "clip_ratio/high_max": 1.642513325350592e-05, + "clip_ratio/high_mean": 4.10628331337648e-06, + "clip_ratio/low_mean": 3.813199691649061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2238279775119736e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15516.0, + "completions/max_terminated_length": 15516.0, + "completions/mean_length": 5786.859375, + "completions/mean_terminated_length": 5786.859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 1.0515320897102356, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.008517255075275898, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 84879833.0, + "reward": 0.3671875, + "reward_std": 0.3311441242694855, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.00010231334454147145, + "sampling/sampling_logp_difference/max": 9.187470436096191, + "sampling/sampling_logp_difference/mean": 0.01993538998067379, + "step": 108 + }, + { + "clip_ratio/high_max": 7.0043706728029065e-06, + "clip_ratio/high_mean": 1.7510926682007266e-06, + "clip_ratio/low_mean": 1.4313530300569255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.606462308245682e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15836.0, + "completions/mean_length": 4726.2578125, + "completions/mean_terminated_length": 4634.46435546875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.795353539288044, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034334585070610046, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 85503162.0, + "reward": 0.6015625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000437498092651, + "sampling/importance_sampling_ratio/min": 0.0026589478366076946, + "sampling/sampling_logp_difference/max": 5.9298248291015625, + "sampling/sampling_logp_difference/mean": 0.018191032111644745, + "step": 109 + }, + { + "clip_ratio/high_max": 4.149239885009592e-06, + "clip_ratio/high_mean": 1.037309971252398e-06, + "clip_ratio/low_mean": 3.989860044839588e-05, + "clip_ratio/low_min": 4.927079316985328e-06, + "clip_ratio/region_mean": 4.093591041964828e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6450.140625, + "completions/mean_terminated_length": 5787.8837890625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.8920315206050873, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006242698058485985, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 86350364.0, + "reward": 0.359375, + "reward_std": 0.27540695667266846, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + 
"sampling/importance_sampling_ratio/min": 0.00015162504860199988, + "sampling/sampling_logp_difference/max": 8.794099807739258, + "sampling/sampling_logp_difference/mean": 0.01948007568717003, + "step": 110 + }, + { + "clip_ratio/high_max": 4.065173015987966e-06, + "clip_ratio/high_mean": 1.8426849237584975e-06, + "clip_ratio/low_mean": 2.8560575628944207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0403260552702704e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15253.0, + "completions/mean_length": 6597.9453125, + "completions/mean_terminated_length": 6442.611328125, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "entropy": 0.9351271465420723, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002594202058389783, + "learning_rate": 1e-05, + "loss": 0.018, + "num_tokens": 87213277.0, + "reward": 0.34375, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998740553855896, + "sampling/importance_sampling_ratio/min": 0.007402713876217604, + "sampling/sampling_logp_difference/max": 4.905908584594727, + "sampling/sampling_logp_difference/mean": 0.02082553133368492, + "step": 111 + }, + { + "clip_ratio/high_max": 3.7528520806517918e-06, + "clip_ratio/high_mean": 9.382130201629479e-07, + "clip_ratio/low_mean": 4.297400278119312e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.391221568766923e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15955.0, + "completions/mean_length": 7109.9140625, + "completions/mean_terminated_length": 7036.8896484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 
0.8797949478030205, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002718541072681546, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 88144530.0, + "reward": 0.3984375, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 0.0002329955023014918, + "sampling/sampling_logp_difference/max": 8.36449146270752, + "sampling/sampling_logp_difference/mean": 0.01960277371108532, + "step": 112 + }, + { + "clip_ratio/high_max": 1.1254821401962545e-05, + "clip_ratio/high_mean": 2.813705350490636e-06, + "clip_ratio/low_mean": 4.423825043886609e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7051955789356725e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 7361.6796875, + "completions/mean_terminated_length": 6513.427734375, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9020541086792946, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003097688313573599, + "learning_rate": 1e-05, + "loss": 0.0854, + "num_tokens": 89109897.0, + "reward": 0.359375, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998917579650879, + "sampling/importance_sampling_ratio/min": 0.0010758653515949845, + "sampling/sampling_logp_difference/max": 6.834630012512207, + "sampling/sampling_logp_difference/mean": 0.01997425965964794, + "step": 113 + }, + { + "clip_ratio/high_max": 1.8235970401292434e-05, + "clip_ratio/high_mean": 
5.248351158115838e-06, + "clip_ratio/low_mean": 7.228819413285237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.753654563202872e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 5957.28125, + "completions/mean_terminated_length": 5620.935546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.8262394368648529, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0023438548669219017, + "learning_rate": 1e-05, + "loss": 0.0869, + "num_tokens": 89891429.0, + "reward": 0.421875, + "reward_std": 0.3713865876197815, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998185634613037, + "sampling/importance_sampling_ratio/min": 8.2406731962692e-06, + "sampling/sampling_logp_difference/max": 11.706428527832031, + "sampling/sampling_logp_difference/mean": 0.018976174294948578, + "step": 114 + }, + { + "clip_ratio/high_max": 1.6280149793601595e-05, + "clip_ratio/high_mean": 5.4644419833493885e-06, + "clip_ratio/low_mean": 5.1420432782833814e-05, + "clip_ratio/low_min": 6.1973228184797335e-06, + "clip_ratio/region_mean": 5.688487522093055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5405.53125, + "completions/mean_terminated_length": 5142.04833984375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.9246686547994614, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005619170609861612, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 90600721.0, + "reward": 0.421875, + "reward_std": 0.40821409225463867, + 
"rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 7.91921266340978e-08, + "sampling/sampling_logp_difference/max": 16.351388931274414, + "sampling/sampling_logp_difference/mean": 0.01931554079055786, + "step": 115 + }, + { + "clip_ratio/high_max": 9.228460612575873e-06, + "clip_ratio/high_mean": 2.307115153143968e-06, + "clip_ratio/low_mean": 3.463903834699522e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.694615350013919e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6754.859375, + "completions/mean_terminated_length": 6363.4306640625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.952000230550766, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006043895613402128, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 91486063.0, + "reward": 0.3125, + "reward_std": 0.2527858018875122, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999067783355713, + "sampling/importance_sampling_ratio/min": 2.5612887384340866e-06, + "sampling/sampling_logp_difference/max": 12.875, + "sampling/sampling_logp_difference/mean": 0.02107170596718788, + "step": 116 + }, + { + "clip_ratio/high_max": 1.460792736907024e-05, + "clip_ratio/high_mean": 3.65198184226756e-06, + "clip_ratio/low_mean": 3.14642731495951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.511625499186266e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16377.0, + 
"completions/mean_length": 8135.8203125, + "completions/mean_terminated_length": 7869.75, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 1.0832853615283966, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00392121123149991, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 92546920.0, + "reward": 0.28125, + "reward_std": 0.2977413833141327, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 3.757069134735502e-05, + "sampling/sampling_logp_difference/max": 10.189286231994629, + "sampling/sampling_logp_difference/mean": 0.02211480587720871, + "step": 117 + }, + { + "clip_ratio/high_max": 3.585687591112219e-05, + "clip_ratio/high_mean": 8.964218977780547e-06, + "clip_ratio/low_mean": 3.652223790595599e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.548645733848389e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15497.0, + "completions/mean_length": 5476.53125, + "completions/mean_terminated_length": 5214.75244140625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.0261689275503159, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00480870483443141, + "learning_rate": 1e-05, + "loss": 0.0378, + "num_tokens": 93270524.0, + "reward": 0.46875, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.02749602682888508, + "sampling/sampling_logp_difference/max": 3.5937137603759766, + 
"sampling/sampling_logp_difference/mean": 0.01990744285285473, + "step": 118 + }, + { + "clip_ratio/high_max": 5.126732958160574e-06, + "clip_ratio/high_mean": 1.2816832395401434e-06, + "clip_ratio/low_mean": 3.6732255466631614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8013938819858595e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7670.0625, + "completions/mean_terminated_length": 7165.9501953125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.8719229996204376, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003411791054531932, + "learning_rate": 1e-05, + "loss": 0.0792, + "num_tokens": 94271404.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 2.125909531969228e-06, + "sampling/sampling_logp_difference/max": 13.061310768127441, + "sampling/sampling_logp_difference/mean": 0.01960139349102974, + "step": 119 + }, + { + "clip_ratio/high_max": 3.2011115308705484e-05, + "clip_ratio/high_mean": 1.0189622685174982e-05, + "clip_ratio/low_mean": 3.3884271260831156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4073893604945624e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15284.0, + "completions/mean_length": 5499.0859375, + "completions/mean_terminated_length": 5413.3779296875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8891193494200706, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0036615384742617607, + 
"learning_rate": 1e-05, + "loss": 0.0852, + "num_tokens": 94998263.0, + "reward": 0.4296875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.00029556488152593374, + "sampling/sampling_logp_difference/max": 8.126622200012207, + "sampling/sampling_logp_difference/mean": 0.01831059902906418, + "step": 120 + }, + { + "clip_ratio/high_max": 1.0020951322076144e-05, + "clip_ratio/high_mean": 2.505237830519036e-06, + "clip_ratio/low_mean": 3.4662164466681133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.716740218351333e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7831.1015625, + "completions/mean_terminated_length": 7410.466796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.9511109218001366, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003688640194013715, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 96020572.0, + "reward": 0.34375, + "reward_std": 0.266974538564682, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.0008284422219730914, + "sampling/sampling_logp_difference/max": 7.095963478088379, + "sampling/sampling_logp_difference/mean": 0.020766064524650574, + "step": 121 + }, + { + "clip_ratio/high_max": 4.31883336204919e-06, + "clip_ratio/high_mean": 1.0797083405122976e-06, + "clip_ratio/low_mean": 4.2512260733929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.359196918812813e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 7928.5, + "completions/mean_terminated_length": 7584.7802734375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 1.053833745419979, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002527788048610091, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 97055892.0, + "reward": 0.2734375, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999517560005188, + "sampling/importance_sampling_ratio/min": 8.097423233266454e-06, + "sampling/sampling_logp_difference/max": 11.72396469116211, + "sampling/sampling_logp_difference/mean": 0.02571871504187584, + "step": 122 + }, + { + "clip_ratio/high_max": 2.1440137970785145e-05, + "clip_ratio/high_mean": 5.360034492696286e-06, + "clip_ratio/low_mean": 5.3688914704252966e-05, + "clip_ratio/low_min": 1.0726187383625074e-05, + "clip_ratio/region_mean": 5.904894931063609e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15282.0, + "completions/mean_length": 7433.0, + "completions/mean_terminated_length": 7218.17626953125, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 1.0001763850450516, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004057250916957855, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 98026604.0, + "reward": 0.3046875, + "reward_std": 0.30274903774261475, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.0026400478091090918, + "sampling/sampling_logp_difference/max": 5.936958312988281, + "sampling/sampling_logp_difference/mean": 0.020892417058348656, + "step": 123 + }, + { + "clip_ratio/high_max": 7.200895424830378e-06, + "clip_ratio/high_mean": 1.8002238562075945e-06, + "clip_ratio/low_mean": 3.0267089357494115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.206731355476222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15435.0, + "completions/mean_length": 6529.8046875, + "completions/mean_terminated_length": 6211.92724609375, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.0204281583428383, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004367270041257143, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 98882667.0, + "reward": 0.421875, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 0.00013984176621306688, + "sampling/sampling_logp_difference/max": 8.874999046325684, + "sampling/sampling_logp_difference/mean": 0.020555414259433746, + "step": 124 + }, + { + "clip_ratio/high_max": 3.583304760468309e-06, + "clip_ratio/high_mean": 8.958261901170772e-07, + "clip_ratio/low_mean": 3.819216192368913e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.908798782958911e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 7329.9140625, + "completions/mean_terminated_length": 6806.12353515625, + "completions/min_length": 
525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.8461082950234413, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014496444491669536, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 99847384.0, + "reward": 0.375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 2.1445715901791118e-05, + "sampling/sampling_logp_difference/max": 10.749985694885254, + "sampling/sampling_logp_difference/mean": 0.019216356799006462, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0716735232563224e-05, + "clip_ratio/high_mean": 2.679183808140806e-06, + "clip_ratio/low_mean": 3.4717084645308205e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7396268680822686e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15200.0, + "completions/mean_length": 6518.4765625, + "completions/mean_terminated_length": 6200.23388671875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.880072832107544, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006009541917592287, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 100699437.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 6.729899905622005e-05, + "sampling/sampling_logp_difference/max": 9.606365203857422, + "sampling/sampling_logp_difference/mean": 0.01985173299908638, + "step": 126 + }, + { + "clip_ratio/high_max": 
7.563064400528674e-06, + "clip_ratio/high_mean": 1.8907661001321685e-06, + "clip_ratio/low_mean": 3.8401355027417594e-05, + "clip_ratio/low_min": 3.4494178180466406e-06, + "clip_ratio/region_mean": 4.029212129808002e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 8421.9296875, + "completions/mean_terminated_length": 8030.35205078125, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.929582305252552, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00254544778726995, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 101797124.0, + "reward": 0.2890625, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 2.139152456948068e-05, + "sampling/sampling_logp_difference/max": 10.75251579284668, + "sampling/sampling_logp_difference/mean": 0.020804740488529205, + "step": 127 + }, + { + "clip_ratio/high_max": 8.503243861923693e-06, + "clip_ratio/high_mean": 2.125810965480923e-06, + "clip_ratio/low_mean": 3.5734614471039094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7860425095459505e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14871.0, + "completions/mean_length": 6452.5859375, + "completions/mean_terminated_length": 6214.232421875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9258717745542526, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030309113208204508, + "learning_rate": 1e-05, + "loss": -0.0048, + "num_tokens": 102643751.0, + "reward": 0.4296875, + 
"reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 2.0162780856480822e-05, + "sampling/sampling_logp_difference/max": 10.81167221069336, + "sampling/sampling_logp_difference/mean": 0.02046305686235428, + "step": 128 + }, + { + "clip_ratio/high_max": 1.4670421251139487e-05, + "clip_ratio/high_mean": 4.865382209118252e-06, + "clip_ratio/low_mean": 2.8848363626821083e-05, + "clip_ratio/low_min": 3.2798930078570265e-06, + "clip_ratio/region_mean": 3.371374566540908e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7655.140625, + "completions/mean_terminated_length": 7373.564453125, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "entropy": 1.1112212240695953, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028038588352501392, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 103645849.0, + "reward": 0.390625, + "reward_std": 0.24435339868068695, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.022794192656874657, + "sampling/sampling_logp_difference/max": 3.781249523162842, + "sampling/sampling_logp_difference/mean": 0.022147968411445618, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8828401809732895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8828401809732895e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15694.0, + "completions/mean_length": 8166.765625, + "completions/mean_terminated_length": 7618.9501953125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.8589507639408112, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003990175202488899, + "learning_rate": 1e-05, + "loss": 0.0942, + "num_tokens": 104712987.0, + "reward": 0.4765625, + "reward_std": 0.2680353820323944, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 2.430162021482829e-05, + "sampling/sampling_logp_difference/max": 10.624967575073242, + "sampling/sampling_logp_difference/mean": 0.019254228100180626, + "step": 130 + }, + { + "clip_ratio/high_max": 7.719492032265407e-06, + "clip_ratio/high_mean": 1.9298730080663518e-06, + "clip_ratio/low_mean": 3.547307028384239e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7402943462439e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15444.0, + "completions/mean_length": 5872.40625, + "completions/mean_terminated_length": 5789.6376953125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.0606305003166199, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0038855294696986675, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 105481743.0, + "reward": 0.375, + "reward_std": 0.3527044355869293, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 0.0018136304570361972, + 
"sampling/sampling_logp_difference/max": 6.312424659729004, + "sampling/sampling_logp_difference/mean": 0.021132031455636024, + "step": 131 + }, + { + "clip_ratio/high_max": 1.6221786609094124e-05, + "clip_ratio/high_mean": 5.614050223812228e-06, + "clip_ratio/low_mean": 4.114894863960217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6762998408667045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15742.0, + "completions/mean_length": 6474.9375, + "completions/mean_terminated_length": 6237.1201171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.8699874132871628, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004712321795523167, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 106333695.0, + "reward": 0.53125, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 4.115129559068009e-05, + "sampling/sampling_logp_difference/max": 10.098255157470703, + "sampling/sampling_logp_difference/mean": 0.019161570817232132, + "step": 132 + }, + { + "clip_ratio/high_max": 1.2752746897604084e-05, + "clip_ratio/high_mean": 3.188186724401021e-06, + "clip_ratio/low_mean": 2.881602637216929e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.200421309657031e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 6910.03125, + "completions/mean_terminated_length": 6604.4189453125, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.8597542196512222, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0031310587655752897, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 107236363.0, + "reward": 0.4453125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000447034835815, + "sampling/importance_sampling_ratio/min": 0.0012788315070793033, + "sampling/sampling_logp_difference/max": 6.661808490753174, + "sampling/sampling_logp_difference/mean": 0.019823957234621048, + "step": 133 + }, + { + "clip_ratio/high_max": 1.2087368986613e-05, + "clip_ratio/high_mean": 3.02184224665325e-06, + "clip_ratio/low_mean": 3.179941927555774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.482126135168073e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6172.7421875, + "completions/mean_terminated_length": 5843.3466796875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.9560965895652771, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006233204621821642, + "learning_rate": 1e-05, + "loss": -0.0101, + "num_tokens": 108044714.0, + "reward": 0.4296875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0012860872084274888, + "sampling/sampling_logp_difference/max": 6.656150817871094, + "sampling/sampling_logp_difference/mean": 0.020428352057933807, + "step": 134 + }, + { + "clip_ratio/high_max": 1.846628038038034e-05, + "clip_ratio/high_mean": 4.616570095095085e-06, + "clip_ratio/low_mean": 3.8776780229454744e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339335077929718e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 6232.4609375, + "completions/mean_terminated_length": 5988.82421875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.792289063334465, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005762661807239056, + "learning_rate": 1e-05, + "loss": 0.1106, + "num_tokens": 108862901.0, + "reward": 0.53125, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 0.004497833084315062, + "sampling/sampling_logp_difference/max": 5.4041595458984375, + "sampling/sampling_logp_difference/mean": 0.01772497221827507, + "step": 135 + }, + { + "clip_ratio/high_max": 1.1131890460092109e-05, + "clip_ratio/high_mean": 2.782972615023027e-06, + "clip_ratio/low_mean": 3.377504378931917e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.65580164043422e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15440.0, + "completions/mean_length": 5181.1015625, + "completions/mean_terminated_length": 5003.27783203125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.7691714614629745, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002916123950853944, + "learning_rate": 1e-05, + "loss": 0.088, + "num_tokens": 109544058.0, + "reward": 0.5625, + "reward_std": 0.3327339291572571, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 3.9317873756772315e-07, + "sampling/sampling_logp_difference/max": 14.749001502990723, + "sampling/sampling_logp_difference/mean": 0.017177307978272438, + "step": 136 + }, + { + "clip_ratio/high_max": 2.2183079636306502e-05, + "clip_ratio/high_mean": 5.5457699090766255e-06, + "clip_ratio/low_mean": 3.033036318811355e-05, + "clip_ratio/low_min": 3.5457974263408687e-06, + "clip_ratio/region_mean": 3.587613309719018e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15288.0, + "completions/mean_length": 5583.5859375, + "completions/mean_terminated_length": 5235.185546875, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.922084204852581, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035183338914066553, + "learning_rate": 1e-05, + "loss": 0.0303, + "num_tokens": 110282853.0, + "reward": 0.484375, + "reward_std": 0.24381661415100098, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.0021202145144343376, + "sampling/sampling_logp_difference/max": 6.156238079071045, + "sampling/sampling_logp_difference/mean": 0.01895858161151409, + "step": 137 + }, + { + "clip_ratio/high_max": 2.7135570235259365e-06, + "clip_ratio/high_mean": 6.783892558814841e-07, + "clip_ratio/low_mean": 2.520359919344628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.588198810826725e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 7191.71875, + 
"completions/mean_terminated_length": 6659.93359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.8676051273941994, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002201368333771825, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 111228449.0, + "reward": 0.296875, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.0047781821340322495, + "sampling/sampling_logp_difference/max": 5.343695163726807, + "sampling/sampling_logp_difference/mean": 0.01915489323437214, + "step": 138 + }, + { + "clip_ratio/high_max": 2.2828588043921627e-05, + "clip_ratio/high_mean": 7.982446049936698e-06, + "clip_ratio/low_mean": 4.164742210832628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962986872669717e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 5139.5859375, + "completions/mean_terminated_length": 4869.72021484375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.7077975794672966, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00720562506467104, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 111904700.0, + "reward": 0.5859375, + "reward_std": 0.3566659688949585, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 9.015951036417391e-06, + "sampling/sampling_logp_difference/max": 11.616515159606934, + 
"sampling/sampling_logp_difference/mean": 0.016763046383857727, + "step": 139 + }, + { + "clip_ratio/high_max": 1.3030461104790447e-05, + "clip_ratio/high_mean": 3.257615276197612e-06, + "clip_ratio/low_mean": 5.0197708333143964e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.345532326828106e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15887.0, + "completions/mean_length": 7408.296875, + "completions/mean_terminated_length": 7118.7578125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8338208198547363, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005594039335846901, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 112873218.0, + "reward": 0.3828125, + "reward_std": 0.2806568741798401, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697804450989, + "sampling/importance_sampling_ratio/min": 5.832135502714664e-05, + "sampling/sampling_logp_difference/max": 9.749542236328125, + "sampling/sampling_logp_difference/mean": 0.018874341621994972, + "step": 140 + }, + { + "clip_ratio/high_max": 2.6347406674176455e-06, + "clip_ratio/high_mean": 6.586851668544114e-07, + "clip_ratio/low_mean": 3.066379792926455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132248309611896e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16158.0, + "completions/mean_length": 7637.25, + "completions/mean_terminated_length": 7131.2392578125, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.9943022206425667, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025491444393992424, + "learning_rate": 
1e-05, + "loss": 0.089, + "num_tokens": 113869418.0, + "reward": 0.3046875, + "reward_std": 0.32641828060150146, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999139308929443, + "sampling/importance_sampling_ratio/min": 6.724766876686772e-07, + "sampling/sampling_logp_difference/max": 14.212298393249512, + "sampling/sampling_logp_difference/mean": 0.020018339157104492, + "step": 141 + }, + { + "clip_ratio/high_max": 1.7491673133918084e-05, + "clip_ratio/high_mean": 4.372918283479521e-06, + "clip_ratio/low_mean": 2.370427267806008e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8077190734165924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 6144.8671875, + "completions/mean_terminated_length": 6064.244140625, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.9252935722470284, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003907687962055206, + "learning_rate": 1e-05, + "loss": 0.1115, + "num_tokens": 114674257.0, + "reward": 0.5078125, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485611915588, + "sampling/importance_sampling_ratio/min": 0.003434742335230112, + "sampling/sampling_logp_difference/max": 5.673813343048096, + "sampling/sampling_logp_difference/mean": 0.018300339579582214, + "step": 142 + }, + { + "clip_ratio/high_max": 8.272644663520623e-06, + "clip_ratio/high_mean": 2.0681611658801557e-06, + "clip_ratio/low_mean": 2.688816772433711e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.8956328833373846e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6266.6484375, + "completions/mean_terminated_length": 6186.984375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 1.0926234126091003, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0042014638893306255, + "learning_rate": 1e-05, + "loss": 0.023, + "num_tokens": 115496300.0, + "reward": 0.3671875, + "reward_std": 0.18884867429733276, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999801516532898, + "sampling/importance_sampling_ratio/min": 3.502686922729481e-06, + "sampling/sampling_logp_difference/max": 12.561980247497559, + "sampling/sampling_logp_difference/mean": 0.021998615935444832, + "step": 143 + }, + { + "clip_ratio/high_max": 2.7441840302344644e-05, + "clip_ratio/high_mean": 6.860460075586161e-06, + "clip_ratio/low_mean": 4.51459295618406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.200638997848728e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 6392.890625, + "completions/mean_terminated_length": 6234.3017578125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.9028401970863342, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028757627587765455, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 116333286.0, + "reward": 0.4453125, + "reward_std": 0.35665616393089294, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 2.327528392243039e-07, + "sampling/sampling_logp_difference/max": 15.27328872680664, + "sampling/sampling_logp_difference/mean": 0.019069479778409004, + "step": 144 + }, + { + "clip_ratio/high_max": 1.216986993313185e-05, + "clip_ratio/high_mean": 3.0424674832829623e-06, + "clip_ratio/low_mean": 3.626850991622632e-05, + "clip_ratio/low_min": 4.492201696848497e-06, + "clip_ratio/region_mean": 3.931097762688296e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6300.1640625, + "completions/mean_terminated_length": 6220.763671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.110174722969532, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006776242982596159, + "learning_rate": 1e-05, + "loss": 0.0858, + "num_tokens": 117158619.0, + "reward": 0.3125, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998651742935181, + "sampling/importance_sampling_ratio/min": 6.477496299339691e-07, + "sampling/sampling_logp_difference/max": 14.249761581420898, + "sampling/sampling_logp_difference/mean": 0.022119753062725067, + "step": 145 + }, + { + "clip_ratio/high_max": 7.707248187216464e-06, + "clip_ratio/high_mean": 1.926812046804116e-06, + "clip_ratio/low_mean": 1.452984838579141e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6456660432595527e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7482.25, + "completions/mean_terminated_length": 7340.95263671875, + "completions/min_length": 
169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.9957183450460434, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003441061358898878, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 118140579.0, + "reward": 0.2109375, + "reward_std": 0.23250605165958405, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 1.1689271559589542e-05, + "sampling/sampling_logp_difference/max": 11.356839179992676, + "sampling/sampling_logp_difference/mean": 0.020916422829031944, + "step": 146 + }, + { + "clip_ratio/high_max": 1.3650881555804517e-05, + "clip_ratio/high_mean": 3.4127203889511293e-06, + "clip_ratio/low_mean": 4.652173765862244e-05, + "clip_ratio/low_min": 8.251542112702737e-06, + "clip_ratio/region_mean": 4.993445759282622e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6411.125, + "completions/mean_terminated_length": 6252.82568359375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9852773621678352, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035624606534838676, + "learning_rate": 1e-05, + "loss": 0.0601, + "num_tokens": 118982515.0, + "reward": 0.3984375, + "reward_std": 0.3913620114326477, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.018960632383823395, + "sampling/sampling_logp_difference/max": 3.96539044380188, + "sampling/sampling_logp_difference/mean": 0.020998675376176834, + "step": 147 + }, + { + 
"clip_ratio/high_max": 1.710706237645354e-05, + "clip_ratio/high_mean": 4.276765594113385e-06, + "clip_ratio/low_mean": 2.3662243620492518e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7939009100919066e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15146.0, + "completions/mean_length": 6640.75, + "completions/mean_terminated_length": 6326.45166015625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.8645239844918251, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004949269350618124, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 119851003.0, + "reward": 0.515625, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 5.8315905334893614e-05, + "sampling/sampling_logp_difference/max": 9.749635696411133, + "sampling/sampling_logp_difference/mean": 0.01905224658548832, + "step": 148 + }, + { + "clip_ratio/high_max": 5.033624802308623e-06, + "clip_ratio/high_mean": 2.0922732346662087e-06, + "clip_ratio/low_mean": 5.667686264132499e-05, + "clip_ratio/low_min": 3.2221478249994107e-06, + "clip_ratio/region_mean": 5.876913564861752e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6987.953125, + "completions/mean_terminated_length": 6444.3798828125, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9469119384884834, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005622676108032465, + "learning_rate": 1e-05, + "loss": 0.1008, + "num_tokens": 120765165.0, + "reward": 
0.421875, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999727010726929, + "sampling/importance_sampling_ratio/min": 9.214873716700822e-05, + "sampling/sampling_logp_difference/max": 9.292106628417969, + "sampling/sampling_logp_difference/mean": 0.01969297230243683, + "step": 149 + }, + { + "clip_ratio/high_max": 4.223829364491394e-06, + "clip_ratio/high_mean": 1.8565209529697313e-06, + "clip_ratio/low_mean": 3.030186894648068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.21583895583899e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 7539.2265625, + "completions/mean_terminated_length": 6949.5751953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.983614593744278, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035846447572112083, + "learning_rate": 1e-05, + "loss": -0.0093, + "num_tokens": 121749426.0, + "reward": 0.3828125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 7.889377229730599e-06, + "sampling/sampling_logp_difference/max": 11.749993324279785, + "sampling/sampling_logp_difference/mean": 0.02050059661269188, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0533551176195033e-05, + "clip_ratio/high_mean": 2.6333877940487582e-06, + "clip_ratio/low_mean": 4.44662659901951e-05, + "clip_ratio/low_min": 5.9182802942814305e-06, + "clip_ratio/region_mean": 4.7099654238991207e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15528.0, + "completions/mean_length": 6339.5390625, + "completions/mean_terminated_length": 5845.548828125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.9051830619573593, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005333681590855122, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 122579975.0, + "reward": 0.34375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.0022614477202296257, + "sampling/sampling_logp_difference/max": 6.091750144958496, + "sampling/sampling_logp_difference/mean": 0.019756250083446503, + "step": 151 + }, + { + "clip_ratio/high_max": 5.961464921711013e-06, + "clip_ratio/high_mean": 1.4903662304277532e-06, + "clip_ratio/low_mean": 5.054293433204293e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2033300562470686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6610.8046875, + "completions/mean_terminated_length": 6533.8505859375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9121239259839058, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005628545768558979, + "learning_rate": 1e-05, + "loss": 0.1029, + "num_tokens": 123444686.0, + "reward": 0.5, + "reward_std": 0.3498311936855316, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + 
"sampling/importance_sampling_ratio/min": 0.0027667356189340353, + "sampling/sampling_logp_difference/max": 5.890087127685547, + "sampling/sampling_logp_difference/mean": 0.019961554557085037, + "step": 152 + }, + { + "clip_ratio/high_max": 7.918152277852641e-06, + "clip_ratio/high_mean": 2.778689122351352e-06, + "clip_ratio/low_mean": 4.231535649523721e-05, + "clip_ratio/low_min": 3.3862490909086773e-06, + "clip_ratio/region_mean": 4.509404539021489e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15071.0, + "completions/mean_length": 7214.5546875, + "completions/mean_terminated_length": 6684.0908203125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.9393481463193893, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00521192466840148, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 124389325.0, + "reward": 0.25, + "reward_std": 0.26538968086242676, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.03890184313058853, + "sampling/sampling_logp_difference/max": 3.246713638305664, + "sampling/sampling_logp_difference/mean": 0.02030467614531517, + "step": 153 + }, + { + "clip_ratio/high_max": 1.3099364878144115e-05, + "clip_ratio/high_mean": 3.274841219536029e-06, + "clip_ratio/low_mean": 4.0359405488743505e-05, + "clip_ratio/low_min": 3.400342848181026e-06, + "clip_ratio/region_mean": 4.363424682196637e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15349.0, + "completions/mean_length": 7307.296875, + "completions/mean_terminated_length": 6938.32470703125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 
656.0, + "entropy": 0.9287968128919601, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034769594203680754, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 125344827.0, + "reward": 0.390625, + "reward_std": 0.35035035014152527, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 5.1062532293144614e-05, + "sampling/sampling_logp_difference/max": 9.88245964050293, + "sampling/sampling_logp_difference/mean": 0.0197945274412632, + "step": 154 + }, + { + "clip_ratio/high_max": 6.428839697036892e-06, + "clip_ratio/high_mean": 1.607209924259223e-06, + "clip_ratio/low_mean": 3.123730675724801e-05, + "clip_ratio/low_min": 4.124868155486183e-06, + "clip_ratio/region_mean": 3.284451713625458e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 7255.5703125, + "completions/mean_terminated_length": 7110.6748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.9288185387849808, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005860861856490374, + "learning_rate": 1e-05, + "loss": 0.058, + "num_tokens": 126294060.0, + "reward": 0.3359375, + "reward_std": 0.29719966650009155, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.0017037172801792622, + "sampling/sampling_logp_difference/max": 6.374942779541016, + "sampling/sampling_logp_difference/mean": 0.019849762320518494, + "step": 155 + }, + { + "clip_ratio/high_max": 1.148841965914471e-05, + 
"clip_ratio/high_mean": 2.8721049147861777e-06, + "clip_ratio/low_mean": 4.209472854199703e-05, + "clip_ratio/low_min": 3.21056154461985e-06, + "clip_ratio/region_mean": 4.496683322940953e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16174.0, + "completions/mean_length": 6662.796875, + "completions/mean_terminated_length": 6429.48828125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.8072321340441704, + "epoch": 0.14351425942962281, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004109901376068592, + "learning_rate": 1e-05, + "loss": 0.0365, + "num_tokens": 127163746.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998985528945923, + "sampling/importance_sampling_ratio/min": 0.001930873841047287, + "sampling/sampling_logp_difference/max": 6.249782562255859, + "sampling/sampling_logp_difference/mean": 0.018542557954788208, + "step": 156 + }, + { + "clip_ratio/high_max": 1.4845912573946407e-05, + "clip_ratio/high_mean": 3.7114781434866018e-06, + "clip_ratio/low_mean": 3.845731936280572e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.216879796103967e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16168.0, + "completions/mean_length": 6638.5625, + "completions/mean_terminated_length": 6483.87353515625, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9228496253490448, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005437003914266825, + "learning_rate": 1e-05, + "loss": 0.1272, + "num_tokens": 128035690.0, + "reward": 0.4453125, + "reward_std": 0.325370192527771, + 
"rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808073043823, + "sampling/importance_sampling_ratio/min": 0.0007831641123630106, + "sampling/sampling_logp_difference/max": 7.152168273925781, + "sampling/sampling_logp_difference/mean": 0.019497953355312347, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.087737986286811e-05, + "clip_ratio/low_min": 1.7309419035882456e-05, + "clip_ratio/region_mean": 5.087737986286811e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 8279.7890625, + "completions/mean_terminated_length": 7810.9501953125, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.9365477114915848, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004606325179338455, + "learning_rate": 1e-05, + "loss": 0.0553, + "num_tokens": 129114487.0, + "reward": 0.3359375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999224543571472, + "sampling/importance_sampling_ratio/min": 6.793912234570598e-06, + "sampling/sampling_logp_difference/max": 11.899483680725098, + "sampling/sampling_logp_difference/mean": 0.02114839106798172, + "step": 158 + }, + { + "clip_ratio/high_max": 2.8393386855896097e-05, + "clip_ratio/high_mean": 7.731617188255768e-06, + "clip_ratio/low_mean": 4.6293902641991735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.402551937550015e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15157.0, + 
"completions/mean_length": 6874.5546875, + "completions/mean_terminated_length": 6406.87646484375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.8596161976456642, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032216343097388744, + "learning_rate": 1e-05, + "loss": 0.0979, + "num_tokens": 130011934.0, + "reward": 0.46875, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.0028106109239161015, + "sampling/sampling_logp_difference/max": 5.874353408813477, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 159 + }, + { + "clip_ratio/high_max": 9.702946044853888e-06, + "clip_ratio/high_mean": 2.425736511213472e-06, + "clip_ratio/low_mean": 2.8597237701433187e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1022973985272984e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16193.0, + "completions/mean_length": 6554.3671875, + "completions/mean_terminated_length": 6154.78857421875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9097465947270393, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032756594009697437, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 130870045.0, + "reward": 0.453125, + "reward_std": 0.3006146550178528, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 9.237467679668043e-09, + "sampling/sampling_logp_difference/max": 
18.499998092651367, + "sampling/sampling_logp_difference/mean": 0.019287925213575363, + "step": 160 + }, + { + "clip_ratio/high_max": 2.387705990258837e-05, + "clip_ratio/high_mean": 5.969264975647093e-06, + "clip_ratio/low_mean": 4.071546266004589e-05, + "clip_ratio/low_min": 2.701884795897058e-06, + "clip_ratio/region_mean": 4.6684727863066655e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 7199.9921875, + "completions/mean_terminated_length": 6903.73388671875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.9904173016548157, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003530750283971429, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 131812236.0, + "reward": 0.3125, + "reward_std": 0.30221718549728394, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999665021896362, + "sampling/importance_sampling_ratio/min": 2.5881658984872047e-06, + "sampling/sampling_logp_difference/max": 12.864561080932617, + "sampling/sampling_logp_difference/mean": 0.02212757244706154, + "step": 161 + }, + { + "clip_ratio/high_max": 1.924166053868248e-05, + "clip_ratio/high_mean": 4.81041513467062e-06, + "clip_ratio/low_mean": 4.526082898337336e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.007124354960979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 6822.59375, + "completions/mean_terminated_length": 6670.82568359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 1.0052980855107307, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.004500554408878088, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 132711448.0, + "reward": 0.3203125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998499751091003, + "sampling/importance_sampling_ratio/min": 1.1307781022651398e-07, + "sampling/sampling_logp_difference/max": 15.995189666748047, + "sampling/sampling_logp_difference/mean": 0.02111719362437725, + "step": 162 + }, + { + "clip_ratio/high_max": 1.3326032785698771e-05, + "clip_ratio/high_mean": 3.331508196424693e-06, + "clip_ratio/low_mean": 1.9409651486057555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.274115956879541e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7792.9375, + "completions/mean_terminated_length": 7515.80615234375, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.9114394783973694, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020288117229938507, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 133729832.0, + "reward": 0.3671875, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821782112122, + "sampling/importance_sampling_ratio/min": 0.001612494932487607, + "sampling/sampling_logp_difference/max": 6.4299726486206055, + "sampling/sampling_logp_difference/mean": 0.020228523761034012, + "step": 163 + }, + { + "clip_ratio/high_max": 1.2359042557363864e-05, + "clip_ratio/high_mean": 3.089760639340966e-06, + "clip_ratio/low_mean": 2.9356229674704082e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.244599008667137e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15171.0, + "completions/mean_length": 5908.671875, + "completions/mean_terminated_length": 5826.18896484375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.9869658201932907, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006685085594654083, + "learning_rate": 1e-05, + "loss": -0.0036, + "num_tokens": 134507182.0, + "reward": 0.4609375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.0008160656434483826, + "sampling/sampling_logp_difference/max": 7.111015796661377, + "sampling/sampling_logp_difference/mean": 0.01997402310371399, + "step": 164 + }, + { + "clip_ratio/high_max": 8.511433406965807e-06, + "clip_ratio/high_mean": 2.1278583517414518e-06, + "clip_ratio/low_mean": 3.215114134036412e-05, + "clip_ratio/low_min": 3.941849627153715e-06, + "clip_ratio/region_mean": 3.427900014685292e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16000.0, + "completions/mean_length": 7188.0, + "completions/mean_terminated_length": 6735.7373046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.9519504383206367, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003854887094348669, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 135446382.0, + "reward": 0.4609375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998975992202759, + "sampling/importance_sampling_ratio/min": 0.0011354254093021154, + "sampling/sampling_logp_difference/max": 6.780747890472412, + "sampling/sampling_logp_difference/mean": 0.020226184278726578, + "step": 165 + }, + { + "clip_ratio/high_max": 7.114804702723632e-06, + "clip_ratio/high_mean": 1.778701175680908e-06, + "clip_ratio/low_mean": 1.9188738406228367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0967439695596113e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15712.0, + "completions/mean_length": 5843.5234375, + "completions/mean_terminated_length": 5676.21484375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.9677107483148575, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006265874952077866, + "learning_rate": 1e-05, + "loss": 0.0055, + "num_tokens": 136213233.0, + "reward": 0.296875, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513626098633, + "sampling/importance_sampling_ratio/min": 0.002176719717681408, + "sampling/sampling_logp_difference/max": 6.129936218261719, + "sampling/sampling_logp_difference/mean": 0.021706756204366684, + "step": 166 + }, + { + "clip_ratio/high_max": 5.9216449699306395e-06, + "clip_ratio/high_mean": 1.4804112424826599e-06, + "clip_ratio/low_mean": 2.429895857858355e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.910307100341015e-06, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 6942.15625, + "completions/mean_terminated_length": 
6637.58056640625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 1.076062560081482, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0018056798726320267, + "learning_rate": 1e-05, + "loss": 0.0052, + "num_tokens": 137123405.0, + "reward": 0.2578125, + "reward_std": 0.172288179397583, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101161956787, + "sampling/importance_sampling_ratio/min": 0.022795137017965317, + "sampling/sampling_logp_difference/max": 3.781208038330078, + "sampling/sampling_logp_difference/mean": 0.02278529666364193, + "step": 167 + }, + { + "clip_ratio/high_max": 2.8421666684153024e-05, + "clip_ratio/high_mean": 8.364482027900522e-06, + "clip_ratio/low_mean": 4.042915224999888e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8793634050525725e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 7368.4609375, + "completions/mean_terminated_length": 7001.9755859375, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.9278362467885017, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132089575752616, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 138084464.0, + "reward": 0.421875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999951958656311, + "sampling/importance_sampling_ratio/min": 5.144971510162577e-05, + "sampling/sampling_logp_difference/max": 9.874905586242676, + "sampling/sampling_logp_difference/mean": 0.020028186962008476, 
+ "step": 168 + }, + { + "clip_ratio/high_max": 6.84724363964051e-06, + "clip_ratio/high_mean": 1.7118109099101275e-06, + "clip_ratio/low_mean": 3.8177841361175524e-05, + "clip_ratio/low_min": 9.023873644764535e-06, + "clip_ratio/region_mean": 3.988965249845933e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 8278.578125, + "completions/mean_terminated_length": 8017.11279296875, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "entropy": 0.9731236174702644, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003180777421221137, + "learning_rate": 1e-05, + "loss": 0.0708, + "num_tokens": 139164722.0, + "reward": 0.296875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 4.579544565785909e-06, + "sampling/sampling_logp_difference/max": 12.29391098022461, + "sampling/sampling_logp_difference/mean": 0.020700933411717415, + "step": 169 + }, + { + "clip_ratio/high_max": 2.3081439849192975e-05, + "clip_ratio/high_mean": 7.712801448178652e-06, + "clip_ratio/low_mean": 4.41923687048984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.190517117625859e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16136.0, + "completions/mean_length": 7237.2578125, + "completions/mean_terminated_length": 6865.43896484375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.7624354660511017, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004162010736763477, + "learning_rate": 1e-05, + "loss": 0.0955, + "num_tokens": 
140109163.0, + "reward": 0.5078125, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 2.4133163606165908e-05, + "sampling/sampling_logp_difference/max": 10.63192367553711, + "sampling/sampling_logp_difference/mean": 0.017928704619407654, + "step": 170 + }, + { + "clip_ratio/high_max": 1.8008577626460465e-05, + "clip_ratio/high_mean": 4.502144406615116e-06, + "clip_ratio/low_mean": 2.0606968291758676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.510911281206063e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15857.0, + "completions/mean_length": 7307.4296875, + "completions/mean_terminated_length": 7089.59228515625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.9450376927852631, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003527693450450897, + "learning_rate": 1e-05, + "loss": 0.0442, + "num_tokens": 141063738.0, + "reward": 0.2890625, + "reward_std": 0.22673209011554718, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998871088027954, + "sampling/importance_sampling_ratio/min": 2.8798374842153862e-05, + "sampling/sampling_logp_difference/max": 10.455191612243652, + "sampling/sampling_logp_difference/mean": 0.021664291620254517, + "step": 171 + }, + { + "clip_ratio/high_max": 1.9155178961227648e-05, + "clip_ratio/high_mean": 4.788794740306912e-06, + "clip_ratio/low_mean": 3.323748410366534e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802627873028541e-05, + "completions/clipped_ratio": 
0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 5985.9765625, + "completions/mean_terminated_length": 5736.42431640625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.8568939119577408, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002400327706709504, + "learning_rate": 1e-05, + "loss": 0.0778, + "num_tokens": 141848599.0, + "reward": 0.4921875, + "reward_std": 0.1922685205936432, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 1.3575387924902316e-08, + "sampling/sampling_logp_difference/max": 18.115007400512695, + "sampling/sampling_logp_difference/mean": 0.018963739275932312, + "step": 172 + }, + { + "clip_ratio/high_max": 1.6673273876222083e-05, + "clip_ratio/high_mean": 4.978134711564053e-06, + "clip_ratio/low_mean": 4.1565862602510606e-05, + "clip_ratio/low_min": 6.89249168317474e-06, + "clip_ratio/region_mean": 4.654399640457996e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 8078.8359375, + "completions/mean_terminated_length": 7810.92724609375, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.0634759217500687, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003575773909687996, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 142902666.0, + "reward": 0.3828125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999899864196777, + "sampling/importance_sampling_ratio/min": 1.8410922848488553e-06, + "sampling/sampling_logp_difference/max": 13.205151557922363, + "sampling/sampling_logp_difference/mean": 0.021685753017663956, + "step": 173 + }, + { + "clip_ratio/high_max": 1.2325835996307433e-05, + "clip_ratio/high_mean": 3.081458999076858e-06, + "clip_ratio/low_mean": 4.288118509521155e-05, + "clip_ratio/low_min": 7.69851726545312e-06, + "clip_ratio/region_mean": 4.596264443534892e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15876.0, + "completions/mean_length": 8138.515625, + "completions/mean_terminated_length": 7588.81689453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 1.0329038575291634, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003307635197415948, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 143967484.0, + "reward": 0.3203125, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.07909657061100006, + "sampling/sampling_logp_difference/max": 2.537085771560669, + "sampling/sampling_logp_difference/mean": 0.02233392372727394, + "step": 174 + }, + { + "clip_ratio/high_max": 2.3158392650657333e-05, + "clip_ratio/high_mean": 5.789598162664333e-06, + "clip_ratio/low_mean": 3.4071419804604375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986101773989503e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 8144.21875, + "completions/mean_terminated_length": 7878.4189453125, + "completions/min_length": 828.0, + 
"completions/min_terminated_length": 828.0, + "entropy": 0.9547601044178009, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022392498794943094, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 145028608.0, + "reward": 0.3515625, + "reward_std": 0.20411096513271332, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.9027246455370914e-06, + "sampling/sampling_logp_difference/max": 12.749860763549805, + "sampling/sampling_logp_difference/mean": 0.0203234925866127, + "step": 175 + }, + { + "clip_ratio/high_max": 1.330557415712974e-05, + "clip_ratio/high_mean": 3.326393539282435e-06, + "clip_ratio/low_mean": 3.57260964847228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.905248979663156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 6289.40625, + "completions/mean_terminated_length": 6129.1748046875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9483931511640549, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005551324691623449, + "learning_rate": 1e-05, + "loss": 0.085, + "num_tokens": 145851292.0, + "reward": 0.484375, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369382858276, + "sampling/importance_sampling_ratio/min": 0.0024864254519343376, + "sampling/sampling_logp_difference/max": 5.996909141540527, + "sampling/sampling_logp_difference/mean": 0.020259611308574677, + "step": 176 + }, + { + "clip_ratio/high_max": 
8.344215302713565e-06, + "clip_ratio/high_mean": 2.086053825678391e-06, + "clip_ratio/low_mean": 5.073524926046957e-05, + "clip_ratio/low_min": 2.859953838196816e-06, + "clip_ratio/region_mean": 5.282130268824403e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 8855.9296875, + "completions/mean_terminated_length": 8354.05859375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 1.003264345228672, + "epoch": 0.16283348666053357, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038497373461723328, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 147004723.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.0003718819934874773, + "sampling/sampling_logp_difference/max": 7.8969340324401855, + "sampling/sampling_logp_difference/mean": 0.02178027108311653, + "step": 177 + }, + { + "clip_ratio/high_max": 1.2368503575999057e-05, + "clip_ratio/high_mean": 3.0921258939997642e-06, + "clip_ratio/low_mean": 4.947490833728807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.256703434497467e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16199.0, + "completions/mean_length": 7574.3359375, + "completions/mean_terminated_length": 7434.50048828125, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.9448538422584534, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005027150269597769, + "learning_rate": 1e-05, + "loss": 0.054, + "num_tokens": 147996190.0, + "reward": 0.359375, + 
"reward_std": 0.3316858410835266, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 2.846284814950195e-06, + "sampling/sampling_logp_difference/max": 12.769495964050293, + "sampling/sampling_logp_difference/mean": 0.020686112344264984, + "step": 178 + }, + { + "clip_ratio/high_max": 1.6756753666413715e-05, + "clip_ratio/high_mean": 4.189188416603429e-06, + "clip_ratio/low_mean": 3.363430948866153e-05, + "clip_ratio/low_min": 3.5745945297094295e-06, + "clip_ratio/region_mean": 3.7823498018951796e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 7162.5625, + "completions/mean_terminated_length": 6787.70703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.8928515017032623, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325182662345469, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 148931006.0, + "reward": 0.4609375, + "reward_std": 0.3492894768714905, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 1.7868870827442151e-07, + "sampling/sampling_logp_difference/max": 15.537620544433594, + "sampling/sampling_logp_difference/mean": 0.02043815702199936, + "step": 179 + }, + { + "clip_ratio/high_max": 2.08163191928179e-05, + "clip_ratio/high_mean": 5.204079798204475e-06, + "clip_ratio/low_mean": 2.8009484594804235e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3213564165635034e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 7958.2109375, + "completions/mean_terminated_length": 7396.4921875, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.8763524517416954, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003250610316172242, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 149968481.0, + "reward": 0.3046875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999974250793457, + "sampling/importance_sampling_ratio/min": 1.370981294712692e-06, + "sampling/sampling_logp_difference/max": 13.499983787536621, + "sampling/sampling_logp_difference/mean": 0.020478684455156326, + "step": 180 + }, + { + "clip_ratio/high_max": 1.4398233361134771e-05, + "clip_ratio/high_mean": 4.918068043480162e-06, + "clip_ratio/low_mean": 1.937760777082076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4295676269048272e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15728.0, + "completions/mean_length": 6120.296875, + "completions/mean_terminated_length": 5789.20947265625, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7507334873080254, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004865634720772505, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 150768791.0, + "reward": 0.5703125, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999062418937683, + 
"sampling/importance_sampling_ratio/min": 3.535915311658755e-05, + "sampling/sampling_logp_difference/max": 10.249953269958496, + "sampling/sampling_logp_difference/mean": 0.01739395596086979, + "step": 181 + }, + { + "clip_ratio/high_max": 1.170663267657801e-05, + "clip_ratio/high_mean": 2.9266581691445026e-06, + "clip_ratio/low_mean": 5.480891331899329e-05, + "clip_ratio/low_min": 9.078275525098434e-06, + "clip_ratio/region_mean": 5.773557131760754e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 8231.671875, + "completions/mean_terminated_length": 7230.5087890625, + "completions/min_length": 1231.0, + "completions/min_terminated_length": 1231.0, + "entropy": 0.8613645136356354, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027805580757558346, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 151844301.0, + "reward": 0.34375, + "reward_std": 0.35088711977005005, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957799911499, + "sampling/importance_sampling_ratio/min": 0.0015732402680441737, + "sampling/sampling_logp_difference/max": 6.454617977142334, + "sampling/sampling_logp_difference/mean": 0.019971080124378204, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0858868336072192e-05, + "clip_ratio/high_mean": 2.714717084018048e-06, + "clip_ratio/low_mean": 4.333486742780224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.60495848528808e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15682.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6038.4921875, + "completions/mean_terminated_length": 6038.4921875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + 
"entropy": 0.8801494240760803, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028903940692543983, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 152638356.0, + "reward": 0.5234375, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019503593445, + "sampling/importance_sampling_ratio/min": 4.2232054511259776e-06, + "sampling/sampling_logp_difference/max": 12.374916076660156, + "sampling/sampling_logp_difference/mean": 0.019382324069738388, + "step": 183 + }, + { + "clip_ratio/high_max": 7.320573104152572e-06, + "clip_ratio/high_mean": 1.830143276038143e-06, + "clip_ratio/low_mean": 4.994629193788569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.177643492970674e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16235.0, + "completions/mean_length": 7506.921875, + "completions/mean_terminated_length": 7070.34375, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.8713229671120644, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0029546513687819242, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 153618418.0, + "reward": 0.3828125, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 4.4900667717229226e-07, + "sampling/sampling_logp_difference/max": 14.616228103637695, + "sampling/sampling_logp_difference/mean": 0.01928526908159256, + "step": 184 + }, + { + "clip_ratio/high_max": 1.2992590200155973e-05, + "clip_ratio/high_mean": 
3.2481475500389934e-06, + "clip_ratio/low_mean": 2.8494011758084525e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.174215930812352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13999.0, + "completions/mean_length": 6725.921875, + "completions/mean_terminated_length": 6649.8740234375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.9011344686150551, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002384800696745515, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 154502440.0, + "reward": 0.46875, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.0008398547652177513, + "sampling/sampling_logp_difference/max": 7.082281589508057, + "sampling/sampling_logp_difference/mean": 0.020737573504447937, + "step": 185 + }, + { + "clip_ratio/high_max": 2.686360085135675e-05, + "clip_ratio/high_mean": 7.414224342028319e-06, + "clip_ratio/low_mean": 3.7723172567893926e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5137397364669596e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15862.0, + "completions/mean_length": 7285.78125, + "completions/mean_terminated_length": 6992.2900390625, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "entropy": 1.028538629412651, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033664393704384565, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 155454988.0, + "reward": 0.296875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.0003808041801676154, + "sampling/sampling_logp_difference/max": 7.873225212097168, + "sampling/sampling_logp_difference/mean": 0.022076331079006195, + "step": 186 + }, + { + "clip_ratio/high_max": 1.1637549050647067e-05, + "clip_ratio/high_mean": 2.9093872626617667e-06, + "clip_ratio/low_mean": 3.613749231590191e-05, + "clip_ratio/low_min": 6.27866324975912e-06, + "clip_ratio/region_mean": 3.904687946487684e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 7546.1015625, + "completions/mean_terminated_length": 6956.90869140625, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.9216663613915443, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029569920152425766, + "learning_rate": 1e-05, + "loss": 0.0995, + "num_tokens": 156439609.0, + "reward": 0.390625, + "reward_std": 0.305637001991272, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.009956372901797295, + "sampling/sampling_logp_difference/max": 4.609542369842529, + "sampling/sampling_logp_difference/mean": 0.021088771522045135, + "step": 187 + }, + { + "clip_ratio/high_max": 6.485023732238915e-06, + "clip_ratio/high_mean": 1.6212559330597287e-06, + "clip_ratio/low_mean": 1.9624552805908024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1245808738967753e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16271.0, + 
"completions/mean_length": 6866.6015625, + "completions/mean_terminated_length": 6791.66162109375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.9553637430071831, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023973705247044563, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 157343374.0, + "reward": 0.2890625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 7.46340665500611e-05, + "sampling/sampling_logp_difference/max": 9.502913475036621, + "sampling/sampling_logp_difference/mean": 0.021616388112306595, + "step": 188 + }, + { + "clip_ratio/high_max": 9.11087408894673e-06, + "clip_ratio/high_mean": 2.2777185222366825e-06, + "clip_ratio/low_mean": 3.832016966498486e-05, + "clip_ratio/low_min": 5.240211066848133e-06, + "clip_ratio/region_mean": 4.059788818722154e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14284.0, + "completions/mean_length": 6335.9453125, + "completions/mean_terminated_length": 5754.65283203125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.8574290797114372, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023072708863765, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 158173719.0, + "reward": 0.4140625, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.0001612449559615925, + 
"sampling/sampling_logp_difference/max": 8.732585906982422, + "sampling/sampling_logp_difference/mean": 0.018506702035665512, + "step": 189 + }, + { + "clip_ratio/high_max": 3.0578403084291494e-05, + "clip_ratio/high_mean": 9.993626633786334e-06, + "clip_ratio/low_mean": 5.610333710137638e-05, + "clip_ratio/low_min": 1.3168393707019277e-05, + "clip_ratio/region_mean": 6.609696265513776e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 8239.8984375, + "completions/mean_terminated_length": 7768.751953125, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "entropy": 0.8983379155397415, + "epoch": 0.17479300827966882, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004400993697345257, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 159248410.0, + "reward": 0.3125, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 1.1379369198039058e-06, + "sampling/sampling_logp_difference/max": 13.686293601989746, + "sampling/sampling_logp_difference/mean": 0.02096184343099594, + "step": 190 + }, + { + "clip_ratio/high_max": 9.026573934534099e-06, + "clip_ratio/high_mean": 2.2566434836335247e-06, + "clip_ratio/low_mean": 6.66748674120754e-05, + "clip_ratio/low_min": 1.5295650428015506e-05, + "clip_ratio/region_mean": 6.89315111230826e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13741.0, + "completions/mean_length": 6580.921875, + "completions/mean_terminated_length": 5659.26513671875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8277688398957253, + "epoch": 
0.17571297148114076, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00661451555788517, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 160109904.0, + "reward": 0.484375, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.00036075623938813806, + "sampling/sampling_logp_difference/max": 7.927308082580566, + "sampling/sampling_logp_difference/mean": 0.017984790727496147, + "step": 191 + }, + { + "clip_ratio/high_max": 7.435806082867202e-06, + "clip_ratio/high_mean": 1.8589515207168006e-06, + "clip_ratio/low_mean": 4.045673085784074e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2315682549087796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 7627.0, + "completions/mean_terminated_length": 7416.83251953125, + "completions/min_length": 1916.0, + "completions/min_terminated_length": 1916.0, + "entropy": 0.8832443356513977, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004417019430547953, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 161103384.0, + "reward": 0.40625, + "reward_std": 0.3634909689426422, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 4.833659477299079e-05, + "sampling/sampling_logp_difference/max": 9.937321662902832, + "sampling/sampling_logp_difference/mean": 0.01947963796555996, + "step": 192 + }, + { + "clip_ratio/high_max": 9.941184316630824e-06, + "clip_ratio/high_mean": 2.485296079157706e-06, + "clip_ratio/low_mean": 
2.6134909091979353e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8620205910101504e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 8426.1015625, + "completions/mean_terminated_length": 7965.72705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8188603445887566, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0030983765609562397, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 162199765.0, + "reward": 0.25, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411106109619, + "sampling/importance_sampling_ratio/min": 0.0009119694004766643, + "sampling/sampling_logp_difference/max": 6.999904155731201, + "sampling/sampling_logp_difference/mean": 0.02070600539445877, + "step": 193 + }, + { + "clip_ratio/high_max": 2.612139087432297e-05, + "clip_ratio/high_mean": 6.530347718580742e-06, + "clip_ratio/low_mean": 3.7853451885894174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.438379949078808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15904.0, + "completions/mean_length": 7154.2109375, + "completions/mean_terminated_length": 6856.4755859375, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "entropy": 0.9913735538721085, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003430198412388563, + "learning_rate": 1e-05, + "loss": 0.052, + "num_tokens": 163133232.0, + "reward": 0.4296875, + "reward_std": 0.2120065689086914, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.00042929715709760785, + "sampling/sampling_logp_difference/max": 7.753361225128174, + "sampling/sampling_logp_difference/mean": 0.02190260961651802, + "step": 194 + }, + { + "clip_ratio/high_max": 3.1841454983805306e-06, + "clip_ratio/high_mean": 7.960363745951327e-07, + "clip_ratio/low_mean": 3.384581600585079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4641852380445926e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 7693.1328125, + "completions/mean_terminated_length": 7412.7822265625, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "entropy": 0.9887127950787544, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002780586015433073, + "learning_rate": 1e-05, + "loss": 0.0449, + "num_tokens": 164134393.0, + "reward": 0.3515625, + "reward_std": 0.20411095023155212, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999028444290161, + "sampling/importance_sampling_ratio/min": 3.559096626304381e-07, + "sampling/sampling_logp_difference/max": 14.848588943481445, + "sampling/sampling_logp_difference/mean": 0.021110571920871735, + "step": 195 + }, + { + "clip_ratio/high_max": 9.770586984814145e-06, + "clip_ratio/high_mean": 5.008155312680174e-06, + "clip_ratio/low_mean": 5.182203130971175e-05, + "clip_ratio/low_min": 1.5574546068819473e-05, + "clip_ratio/region_mean": 5.683018616764457e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16129.0, + "completions/mean_length": 7072.1484375, + 
"completions/mean_terminated_length": 6771.76611328125, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "entropy": 0.861792616546154, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030156150460243225, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 165063412.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 2.2620308754994767e-06, + "sampling/sampling_logp_difference/max": 12.999247550964355, + "sampling/sampling_logp_difference/mean": 0.019325289875268936, + "step": 196 + }, + { + "clip_ratio/high_max": 2.2510209873871645e-05, + "clip_ratio/high_mean": 6.455301331698138e-06, + "clip_ratio/low_mean": 6.156819108582567e-05, + "clip_ratio/low_min": 5.763157332694391e-06, + "clip_ratio/region_mean": 6.802349253121065e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15062.0, + "completions/mean_length": 7353.421875, + "completions/mean_terminated_length": 7062.11279296875, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.8961873054504395, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034921523183584213, + "learning_rate": 1e-05, + "loss": 0.0161, + "num_tokens": 166024306.0, + "reward": 0.4609375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.0005124400486238301, + "sampling/sampling_logp_difference/max": 7.576326847076416, + 
"sampling/sampling_logp_difference/mean": 0.019593238830566406, + "step": 197 + }, + { + "clip_ratio/high_max": 1.3040991007073899e-05, + "clip_ratio/high_mean": 4.292725350296678e-06, + "clip_ratio/low_mean": 5.347559840629401e-05, + "clip_ratio/low_min": 6.613406640099129e-06, + "clip_ratio/region_mean": 5.776832381343411e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15604.0, + "completions/mean_length": 7348.03125, + "completions/mean_terminated_length": 6903.63916015625, + "completions/min_length": 1619.0, + "completions/min_terminated_length": 1619.0, + "entropy": 0.824029266834259, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027784397825598717, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 166984982.0, + "reward": 0.40625, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 0.0010020677000284195, + "sampling/sampling_logp_difference/max": 6.905689716339111, + "sampling/sampling_logp_difference/mean": 0.01857386901974678, + "step": 198 + }, + { + "clip_ratio/high_max": 3.330808067403268e-05, + "clip_ratio/high_mean": 1.0969530649163062e-05, + "clip_ratio/low_mean": 3.2080681648949394e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3050211388617754e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16358.0, + "completions/mean_length": 7290.4765625, + "completions/mean_terminated_length": 6920.82080078125, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8884479627013206, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.004110465291887522, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 167936971.0, + "reward": 0.4375, + "reward_std": 0.35901516675949097, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 1.8145670992453233e-06, + "sampling/sampling_logp_difference/max": 13.219663619995117, + "sampling/sampling_logp_difference/mean": 0.019696572795510292, + "step": 199 + }, + { + "clip_ratio/high_max": 9.77357763076725e-06, + "clip_ratio/high_mean": 2.4433944076918124e-06, + "clip_ratio/low_mean": 3.466498992565903e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.710838473125477e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15824.0, + "completions/mean_length": 7803.625, + "completions/mean_terminated_length": 6833.66943359375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "entropy": 0.8326860442757607, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002410614863038063, + "learning_rate": 1e-05, + "loss": 0.1147, + "num_tokens": 168955683.0, + "reward": 0.3828125, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 0.0008801451185718179, + "sampling/sampling_logp_difference/max": 7.035423755645752, + "sampling/sampling_logp_difference/mean": 0.018545793369412422, + "step": 200 + }, + { + "clip_ratio/high_max": 1.4602125929741305e-05, + "clip_ratio/high_mean": 3.6505314824353263e-06, + "clip_ratio/low_mean": 3.4781527119776e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.8432058772741584e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6804.34375, + "completions/mean_terminated_length": 6495.322265625, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "entropy": 0.9669496119022369, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034376555122435093, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 169845823.0, + "reward": 0.3828125, + "reward_std": 0.31534504890441895, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 1.767780588579626e-08, + "sampling/sampling_logp_difference/max": 17.850955963134766, + "sampling/sampling_logp_difference/mean": 0.020515555515885353, + "step": 201 + }, + { + "clip_ratio/high_max": 1.5814722473805887e-05, + "clip_ratio/high_mean": 3.953680618451472e-06, + "clip_ratio/low_mean": 3.574208744794305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9695768407455034e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 6827.9609375, + "completions/mean_terminated_length": 6105.23583984375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.8833946585655212, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026675171684473753, + "learning_rate": 1e-05, + "loss": 0.1046, + "num_tokens": 170738210.0, + "reward": 0.421875, + "reward_std": 0.2698654532432556, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 
2.0, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.002906275913119316, + "sampling/sampling_logp_difference/max": 5.840882778167725, + "sampling/sampling_logp_difference/mean": 0.019948139786720276, + "step": 202 + }, + { + "clip_ratio/high_max": 1.6623121837255894e-05, + "clip_ratio/high_mean": 4.1557804593139736e-06, + "clip_ratio/low_mean": 6.462372630267055e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.877950727357529e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15725.0, + "completions/mean_length": 7377.984375, + "completions/mean_terminated_length": 7307.07080078125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8881714344024658, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0039620306342840195, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 171705152.0, + "reward": 0.3359375, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999995231628418, + "sampling/importance_sampling_ratio/min": 2.4554079573135823e-05, + "sampling/sampling_logp_difference/max": 10.614632606506348, + "sampling/sampling_logp_difference/mean": 0.01964445412158966, + "step": 203 + }, + { + "clip_ratio/high_max": 9.639111340220552e-06, + "clip_ratio/high_mean": 2.409777835055138e-06, + "clip_ratio/low_mean": 2.775239624952519e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0162174198267167e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15265.0, + "completions/mean_length": 6051.8828125, + "completions/mean_terminated_length": 5543.74560546875, + "completions/min_length": 
819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.8851477280259132, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0040458571165800095, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 172501881.0, + "reward": 0.4296875, + "reward_std": 0.16781240701675415, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999410510063171, + "sampling/importance_sampling_ratio/min": 0.0021976607386022806, + "sampling/sampling_logp_difference/max": 6.120361804962158, + "sampling/sampling_logp_difference/mean": 0.01957303285598755, + "step": 204 + }, + { + "clip_ratio/high_max": 9.72708312474424e-06, + "clip_ratio/high_mean": 3.529455852913088e-06, + "clip_ratio/low_mean": 5.158422732165491e-05, + "clip_ratio/low_min": 1.1939961495954776e-05, + "clip_ratio/region_mean": 5.5113683174567996e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7830.171875, + "completions/mean_terminated_length": 7409.4912109375, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.9070459827780724, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005941574461758137, + "learning_rate": 1e-05, + "loss": 0.0427, + "num_tokens": 173522391.0, + "reward": 0.34375, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.00011712420382536948, + "sampling/sampling_logp_difference/max": 9.052275657653809, + "sampling/sampling_logp_difference/mean": 0.021295130252838135, + "step": 205 + }, + { + 
"clip_ratio/high_max": 5.5543214330100454e-06, + "clip_ratio/high_mean": 1.3885803582525114e-06, + "clip_ratio/low_mean": 1.718775109793569e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8576331683561875e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15443.0, + "completions/mean_length": 7520.6796875, + "completions/mean_terminated_length": 6769.55078125, + "completions/min_length": 1321.0, + "completions/min_terminated_length": 1321.0, + "entropy": 0.8843575045466423, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025851845275610685, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 174504534.0, + "reward": 0.4765625, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 0.00039556476986035705, + "sampling/sampling_logp_difference/max": 7.835196018218994, + "sampling/sampling_logp_difference/mean": 0.02016005665063858, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0145481155632297e-05, + "clip_ratio/high_mean": 2.536370288908074e-06, + "clip_ratio/low_mean": 3.617897255026037e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.871534295285528e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16173.0, + "completions/mean_length": 7382.1875, + "completions/mean_terminated_length": 6861.42138671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "entropy": 0.916313610970974, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004170550964772701, + "learning_rate": 1e-05, + "loss": 0.047, + "num_tokens": 175472574.0, + "reward": 0.46875, + 
"reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999932646751404, + "sampling/importance_sampling_ratio/min": 2.8054744689143263e-05, + "sampling/sampling_logp_difference/max": 10.481352806091309, + "sampling/sampling_logp_difference/mean": 0.020749717950820923, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.83663013963087e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.83663013963087e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13891.0, + "completions/mean_length": 6122.453125, + "completions/mean_terminated_length": 6041.6533203125, + "completions/min_length": 1192.0, + "completions/min_terminated_length": 1192.0, + "entropy": 0.8984386026859283, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 176275568.0, + "reward": 0.4765625, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 7.88934721640544e-06, + "sampling/sampling_logp_difference/max": 11.74999713897705, + "sampling/sampling_logp_difference/mean": 0.020278753712773323, + "step": 208 + }, + { + "clip_ratio/high_max": 1.4535152331518475e-05, + "clip_ratio/high_mean": 3.6337880828796187e-06, + "clip_ratio/low_mean": 4.3961883989140915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7595671958333696e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15547.0, + "completions/mean_length": 4983.2890625, + "completions/mean_terminated_length": 4709.67236328125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.825260303914547, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004848882555961609, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 176932549.0, + "reward": 0.6484375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.6484375, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616146087646, + "sampling/importance_sampling_ratio/min": 1.626804078114219e-05, + "sampling/sampling_logp_difference/max": 11.026308059692383, + "sampling/sampling_logp_difference/mean": 0.017959970980882645, + "step": 209 + }, + { + "clip_ratio/high_max": 1.1141860795760294e-05, + "clip_ratio/high_mean": 2.7854651989400736e-06, + "clip_ratio/low_mean": 4.2418692146384274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5204157913758536e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15415.0, + "completions/mean_length": 5766.5234375, + "completions/mean_terminated_length": 5511.7041015625, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.9016259610652924, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004749474115669727, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 177691752.0, + "reward": 0.5, + "reward_std": 0.2738044261932373, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000141859054565, + "sampling/importance_sampling_ratio/min": 8.927558155846782e-06, + 
"sampling/sampling_logp_difference/max": 11.626367568969727, + "sampling/sampling_logp_difference/mean": 0.019118282943964005, + "step": 210 + }, + { + "clip_ratio/high_max": 5.5243735914700665e-06, + "clip_ratio/high_mean": 2.1587275114143267e-06, + "clip_ratio/low_mean": 4.609663824339805e-05, + "clip_ratio/low_min": 3.983555870945565e-06, + "clip_ratio/region_mean": 4.8255366664307076e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15696.0, + "completions/mean_length": 6993.671875, + "completions/mean_terminated_length": 6768.30419921875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.9074988812208176, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004418120253831148, + "learning_rate": 1e-05, + "loss": 0.1135, + "num_tokens": 178603454.0, + "reward": 0.5390625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000037670135498, + "sampling/importance_sampling_ratio/min": 0.0018135923892259598, + "sampling/sampling_logp_difference/max": 6.312445640563965, + "sampling/sampling_logp_difference/mean": 0.01957814022898674, + "step": 211 + }, + { + "clip_ratio/high_max": 5.126943051436683e-06, + "clip_ratio/high_mean": 1.2817357628591708e-06, + "clip_ratio/low_mean": 2.7488794444252562e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.877053032079857e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 7445.1328125, + "completions/mean_terminated_length": 6849.20849609375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9255013465881348, + "epoch": 0.19503219871205152, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.00237120408564806, + "learning_rate": 1e-05, + "loss": 0.0172, + "num_tokens": 179577063.0, + "reward": 0.40625, + "reward_std": 0.21040897071361542, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999725818634033, + "sampling/importance_sampling_ratio/min": 9.651589061832055e-05, + "sampling/sampling_logp_difference/max": 9.245802879333496, + "sampling/sampling_logp_difference/mean": 0.02165937051177025, + "step": 212 + }, + { + "clip_ratio/high_max": 1.8956294752570102e-05, + "clip_ratio/high_mean": 4.7390736881425255e-06, + "clip_ratio/low_mean": 2.6486316301088664e-05, + "clip_ratio/low_min": 3.516273409331916e-06, + "clip_ratio/region_mean": 3.122539010291803e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 6120.5546875, + "completions/mean_terminated_length": 5703.34130859375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.8181199952960014, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004715202376246452, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 180380422.0, + "reward": 0.5, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999874472618103, + "sampling/importance_sampling_ratio/min": 0.004350374918431044, + "sampling/sampling_logp_difference/max": 5.437493324279785, + "sampling/sampling_logp_difference/mean": 0.018377620726823807, + "step": 213 + }, + { + "clip_ratio/high_max": 5.594843969447538e-06, + "clip_ratio/high_mean": 2.376495558564784e-06, + "clip_ratio/low_mean": 
3.4097628713425365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6474124044616474e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 6351.203125, + "completions/mean_terminated_length": 5857.78662109375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.8798654451966286, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003063712501898408, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 181212776.0, + "reward": 0.453125, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999946355819702, + "sampling/importance_sampling_ratio/min": 7.891544555604924e-06, + "sampling/sampling_logp_difference/max": 11.74971866607666, + "sampling/sampling_logp_difference/mean": 0.019523698836565018, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.544438988001275e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.544438988001275e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14180.0, + "completions/mean_length": 6330.046875, + "completions/mean_terminated_length": 6170.46044921875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.8319354206323624, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033194730058312416, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 182041910.0, + "reward": 0.453125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998994469642639, + "sampling/importance_sampling_ratio/min": 0.00010535263572819531, + "sampling/sampling_logp_difference/max": 9.158197402954102, + "sampling/sampling_logp_difference/mean": 0.018981872126460075, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7156292415165808e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7156292415165808e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15982.0, + "completions/mean_length": 6665.2890625, + "completions/mean_terminated_length": 6351.7822265625, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9336326420307159, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004492956213653088, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 182914843.0, + "reward": 0.3828125, + "reward_std": 0.14807432889938354, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.011399568989872932, + "sampling/sampling_logp_difference/max": 4.474179744720459, + "sampling/sampling_logp_difference/mean": 0.02088768407702446, + "step": 216 + }, + { + "clip_ratio/high_max": 3.2495465802639956e-05, + "clip_ratio/high_mean": 9.084843100026774e-06, + "clip_ratio/low_mean": 5.4809036328151706e-05, + "clip_ratio/low_min": 8.953898031904828e-06, + "clip_ratio/region_mean": 6.389387954186532e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16064.0, + "completions/mean_length": 5393.9140625, + "completions/mean_terminated_length": 5039.39501953125, + 
"completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.7864786610007286, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003816079581156373, + "learning_rate": 1e-05, + "loss": -0.004, + "num_tokens": 183628152.0, + "reward": 0.546875, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.003246711567044258, + "sampling/sampling_logp_difference/max": 5.730112552642822, + "sampling/sampling_logp_difference/mean": 0.018448319286108017, + "step": 217 + }, + { + "clip_ratio/high_max": 8.638648068881594e-06, + "clip_ratio/high_mean": 2.1596620172203984e-06, + "clip_ratio/low_mean": 1.6896704778446292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9056366909353528e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15453.0, + "completions/mean_length": 7161.5, + "completions/mean_terminated_length": 7015.111328125, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.915394201874733, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003666195785626769, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 184562352.0, + "reward": 0.3671875, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00025550799909979105, + "sampling/sampling_logp_difference/max": 8.272256851196289, + "sampling/sampling_logp_difference/mean": 0.019755780696868896, + "step": 218 + }, + { + 
"clip_ratio/high_max": 6.424931598303374e-06, + "clip_ratio/high_mean": 1.6062328995758435e-06, + "clip_ratio/low_mean": 2.49038239417132e-05, + "clip_ratio/low_min": 4.00025601265952e-06, + "clip_ratio/region_mean": 2.651005689813246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15408.0, + "completions/mean_length": 7957.671875, + "completions/mean_terminated_length": 7685.8544921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 1.1176252663135529, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025940234772861004, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 185606670.0, + "reward": 0.1171875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.0007622809498570859, + "sampling/sampling_logp_difference/max": 7.179195404052734, + "sampling/sampling_logp_difference/mean": 0.02338646724820137, + "step": 219 + }, + { + "clip_ratio/high_max": 1.9903963220713194e-05, + "clip_ratio/high_mean": 5.829163114867697e-06, + "clip_ratio/low_mean": 4.4742550926457625e-05, + "clip_ratio/low_min": 3.5803282116830815e-06, + "clip_ratio/region_mean": 5.057171370026481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16146.0, + "completions/mean_length": 7060.6640625, + "completions/mean_terminated_length": 6759.9111328125, + "completions/min_length": 1460.0, + "completions/min_terminated_length": 1460.0, + "entropy": 0.9148540124297142, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004315398633480072, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 186526883.0, + 
"reward": 0.5078125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0004585353017318994, + "sampling/sampling_logp_difference/max": 7.687473297119141, + "sampling/sampling_logp_difference/mean": 0.01967843994498253, + "step": 220 + }, + { + "clip_ratio/high_max": 1.147099328591139e-05, + "clip_ratio/high_mean": 2.8677483214778476e-06, + "clip_ratio/low_mean": 2.8967988555450574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1835736763241584e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15596.0, + "completions/mean_length": 6649.6640625, + "completions/mean_terminated_length": 6416.04052734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9298559054732323, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030786178540438414, + "learning_rate": 1e-05, + "loss": 0.0606, + "num_tokens": 187397536.0, + "reward": 0.4453125, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005841255188, + "sampling/importance_sampling_ratio/min": 3.2821125728332845e-07, + "sampling/sampling_logp_difference/max": 14.929608345031738, + "sampling/sampling_logp_difference/mean": 0.020215414464473724, + "step": 221 + }, + { + "clip_ratio/high_max": 2.2768570943298982e-05, + "clip_ratio/high_mean": 5.692142735824746e-06, + "clip_ratio/low_mean": 3.249637484259438e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8188517464732286e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 8292.015625, + "completions/mean_terminated_length": 7823.8837890625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.8232023045420647, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002438523108139634, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 188477778.0, + "reward": 0.328125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.005636279005557299, + "sampling/sampling_logp_difference/max": 5.178531169891357, + "sampling/sampling_logp_difference/mean": 0.018984414637088776, + "step": 222 + }, + { + "clip_ratio/high_max": 2.0840709566982696e-05, + "clip_ratio/high_mean": 6.135253556749376e-06, + "clip_ratio/low_mean": 2.255633432923787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.869158777230041e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15991.0, + "completions/mean_length": 7600.9765625, + "completions/mean_terminated_length": 6936.71484375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.8689917623996735, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004773247055709362, + "learning_rate": 1e-05, + "loss": 0.0486, + "num_tokens": 189470655.0, + "reward": 0.40625, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + 
"sampling/importance_sampling_ratio/min": 0.001327168894931674, + "sampling/sampling_logp_difference/max": 6.624707221984863, + "sampling/sampling_logp_difference/mean": 0.018666012212634087, + "step": 223 + }, + { + "clip_ratio/high_max": 9.837458947004052e-06, + "clip_ratio/high_mean": 2.459364736751013e-06, + "clip_ratio/low_mean": 6.463955219260242e-05, + "clip_ratio/low_min": 1.0895145351241808e-05, + "clip_ratio/region_mean": 6.70989177251613e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 7600.34375, + "completions/mean_terminated_length": 6855.96630859375, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "entropy": 0.7636929750442505, + "epoch": 0.20607175712971482, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004298723768442869, + "learning_rate": 1e-05, + "loss": 0.145, + "num_tokens": 190462227.0, + "reward": 0.515625, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999310374259949, + "sampling/importance_sampling_ratio/min": 4.5565320760942996e-05, + "sampling/sampling_logp_difference/max": 9.996363639831543, + "sampling/sampling_logp_difference/mean": 0.018035393208265305, + "step": 224 + }, + { + "clip_ratio/high_max": 1.4060602325116633e-05, + "clip_ratio/high_mean": 3.5151505812791584e-06, + "clip_ratio/low_mean": 2.6516039497437305e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.003119024924672e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15151.0, + "completions/mean_length": 6512.0, + "completions/mean_terminated_length": 6434.267578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + 
"entropy": 0.9043584689497948, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006741553544998169, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 191312483.0, + "reward": 0.484375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 1.778468504198827e-05, + "sampling/sampling_logp_difference/max": 10.937172889709473, + "sampling/sampling_logp_difference/mean": 0.020878732204437256, + "step": 225 + }, + { + "clip_ratio/high_max": 1.7356085209030425e-05, + "clip_ratio/high_mean": 4.339021302257606e-06, + "clip_ratio/low_mean": 2.8831826739406097e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.317084781429003e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16339.0, + "completions/mean_length": 7178.6875, + "completions/mean_terminated_length": 6565.00048828125, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.8899475410580635, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00281486171297729, + "learning_rate": 1e-05, + "loss": 0.0515, + "num_tokens": 192251235.0, + "reward": 0.3984375, + "reward_std": 0.2240736484527588, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999714493751526, + "sampling/importance_sampling_ratio/min": 9.012543159769848e-05, + "sampling/sampling_logp_difference/max": 9.314308166503906, + "sampling/sampling_logp_difference/mean": 0.020196784287691116, + "step": 226 + }, + { + "clip_ratio/high_max": 1.5558084214717383e-05, + "clip_ratio/high_mean": 
3.889521053679346e-06, + "clip_ratio/low_mean": 3.0248688972278615e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.413820991227112e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15501.0, + "completions/max_terminated_length": 15501.0, + "completions/mean_length": 6602.5625, + "completions/mean_terminated_length": 6602.5625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.9266818463802338, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005070593673735857, + "learning_rate": 1e-05, + "loss": 0.0781, + "num_tokens": 193116763.0, + "reward": 0.53125, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999746680259705, + "sampling/importance_sampling_ratio/min": 2.726537559283315e-06, + "sampling/sampling_logp_difference/max": 12.812478065490723, + "sampling/sampling_logp_difference/mean": 0.020026464015245438, + "step": 227 + }, + { + "clip_ratio/high_max": 4.188727416476468e-06, + "clip_ratio/high_mean": 1.047181854119117e-06, + "clip_ratio/low_mean": 2.959152834591805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.063871008635033e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6818.8828125, + "completions/mean_terminated_length": 6430.056640625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.874519519507885, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006362155079841614, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 194007868.0, + "reward": 0.4765625, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4765625, + 
"rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.0005216691642999649, + "sampling/sampling_logp_difference/max": 7.55847692489624, + "sampling/sampling_logp_difference/mean": 0.01943325623869896, + "step": 228 + }, + { + "clip_ratio/high_max": 9.645911177358357e-06, + "clip_ratio/high_mean": 2.4114777943395893e-06, + "clip_ratio/low_mean": 6.821557258263056e-05, + "clip_ratio/low_min": 1.7265090718865395e-05, + "clip_ratio/region_mean": 7.062705049065698e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14536.0, + "completions/mean_length": 5515.625, + "completions/mean_terminated_length": 5343.111328125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0683523043990135, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003797185141593218, + "learning_rate": 1e-05, + "loss": 0.1061, + "num_tokens": 194735980.0, + "reward": 0.421875, + "reward_std": 0.34010058641433716, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911367893219, + "sampling/importance_sampling_ratio/min": 1.137102216830499e-07, + "sampling/sampling_logp_difference/max": 15.989612579345703, + "sampling/sampling_logp_difference/mean": 0.02120930328965187, + "step": 229 + }, + { + "clip_ratio/high_max": 2.1971412252241862e-05, + "clip_ratio/high_mean": 5.4928530630604655e-06, + "clip_ratio/low_mean": 4.9151800567415194e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4644653801005916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14992.0, + 
"completions/mean_length": 5853.546875, + "completions/mean_terminated_length": 5770.6298828125, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.7975900694727898, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004124365746974945, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 195504882.0, + "reward": 0.5859375, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.0032877910416573286, + "sampling/sampling_logp_difference/max": 5.717539310455322, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 230 + }, + { + "clip_ratio/high_max": 7.066538728395244e-06, + "clip_ratio/high_mean": 2.843255515472265e-06, + "clip_ratio/low_mean": 5.1467116236381116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.431037175185338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 6686.25, + "completions/mean_terminated_length": 6532.31787109375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.9018580466508865, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024995009880512953, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 196379306.0, + "reward": 0.421875, + "reward_std": 0.35824593901634216, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999300837516785, + "sampling/importance_sampling_ratio/min": 2.0017207134515047e-05, + "sampling/sampling_logp_difference/max": 
10.818918228149414, + "sampling/sampling_logp_difference/mean": 0.018989525735378265, + "step": 231 + }, + { + "clip_ratio/high_max": 6.652828687947476e-06, + "clip_ratio/high_mean": 2.5722979444253724e-06, + "clip_ratio/low_mean": 3.699686294567073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.95691608900961e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16347.0, + "completions/mean_length": 7487.3359375, + "completions/mean_terminated_length": 7200.3466796875, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.9890001565217972, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004295211285352707, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 197357397.0, + "reward": 0.40625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0006548459641635418, + "sampling/sampling_logp_difference/max": 7.33111047744751, + "sampling/sampling_logp_difference/mean": 0.02209121733903885, + "step": 232 + }, + { + "clip_ratio/high_max": 6.0850939007650595e-06, + "clip_ratio/high_mean": 1.5212734751912649e-06, + "clip_ratio/low_mean": 2.9443070673096372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0964344205131056e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7233.484375, + "completions/mean_terminated_length": 6938.30615234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.9683803990483284, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.5, + "grad_norm": 
0.003119673579931259, + "learning_rate": 1e-05, + "loss": 0.0914, + "num_tokens": 198303795.0, + "reward": 0.328125, + "reward_std": 0.23014704883098602, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000243186950684, + "sampling/importance_sampling_ratio/min": 0.020358745008707047, + "sampling/sampling_logp_difference/max": 3.89424467086792, + "sampling/sampling_logp_difference/mean": 0.021085180342197418, + "step": 233 + }, + { + "clip_ratio/high_max": 7.963812095113099e-06, + "clip_ratio/high_mean": 1.9909530237782747e-06, + "clip_ratio/low_mean": 4.031422963635123e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.23051826601295e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15733.0, + "completions/mean_length": 6457.78125, + "completions/mean_terminated_length": 6300.22265625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.8881053999066353, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033790848683565855, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 199154735.0, + "reward": 0.3828125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 2.872048128210736e-07, + "sampling/sampling_logp_difference/max": 15.063070297241211, + "sampling/sampling_logp_difference/mean": 0.01950821653008461, + "step": 234 + }, + { + "clip_ratio/high_max": 9.059622016138746e-06, + "clip_ratio/high_mean": 3.3430123380639998e-06, + "clip_ratio/low_mean": 2.2856192117615137e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.6199204512522556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 7904.40625, + "completions/mean_terminated_length": 7769.81005859375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9881557524204254, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021492803934961557, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 200185643.0, + "reward": 0.359375, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.001458622980862856, + "sampling/sampling_logp_difference/max": 6.530262470245361, + "sampling/sampling_logp_difference/mean": 0.021201875060796738, + "step": 235 + }, + { + "clip_ratio/high_max": 6.9962839006620925e-06, + "clip_ratio/high_mean": 1.7490709751655231e-06, + "clip_ratio/low_mean": 3.018811844412994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193718976035598e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15328.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 7414.4921875, + "completions/mean_terminated_length": 7414.4921875, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "entropy": 0.9571134969592094, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037221095990389585, + "learning_rate": 1e-05, + "loss": 0.0327, + "num_tokens": 201153114.0, + "reward": 0.4375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999958872795105, + "sampling/importance_sampling_ratio/min": 0.0009130563121289015, + "sampling/sampling_logp_difference/max": 6.99871301651001, + "sampling/sampling_logp_difference/mean": 0.021356744691729546, + "step": 236 + }, + { + "clip_ratio/high_max": 1.1248092050664127e-05, + "clip_ratio/high_mean": 2.8120230126660317e-06, + "clip_ratio/low_mean": 5.4354991334548686e-05, + "clip_ratio/low_min": 6.868132004456129e-06, + "clip_ratio/region_mean": 5.716701480196207e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15835.0, + "completions/max_terminated_length": 15835.0, + "completions/mean_length": 5955.953125, + "completions/mean_terminated_length": 5955.953125, + "completions/min_length": 1394.0, + "completions/min_terminated_length": 1394.0, + "entropy": 0.730999618768692, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006285305600613356, + "learning_rate": 1e-05, + "loss": 0.0641, + "num_tokens": 201933044.0, + "reward": 0.59375, + "reward_std": 0.31011277437210083, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.007535050623118877, + "sampling/sampling_logp_difference/max": 4.888189792633057, + "sampling/sampling_logp_difference/mean": 0.016975615173578262, + "step": 237 + }, + { + "clip_ratio/high_max": 7.226686648209579e-06, + "clip_ratio/high_mean": 3.094216481258627e-06, + "clip_ratio/low_mean": 4.66828214484849e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.977703792974353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 6923.3515625, + "completions/mean_terminated_length": 6458.0732421875, + "completions/min_length": 
63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9938417226076126, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005667983554303646, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 202837281.0, + "reward": 0.2578125, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 3.0342773243319243e-05, + "sampling/sampling_logp_difference/max": 10.402952194213867, + "sampling/sampling_logp_difference/mean": 0.022059854120016098, + "step": 238 + }, + { + "clip_ratio/high_max": 5.2318769121484365e-06, + "clip_ratio/high_mean": 1.3079692280371091e-06, + "clip_ratio/low_mean": 4.239228087499214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3700250216716086e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14726.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 5930.9296875, + "completions/mean_terminated_length": 5930.9296875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.8100385963916779, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004052883945405483, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 203614448.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999989926815033, + "sampling/importance_sampling_ratio/min": 0.00015170808183029294, + "sampling/sampling_logp_difference/max": 8.79355239868164, + "sampling/sampling_logp_difference/mean": 0.018519222736358643, + "step": 239 + }, + { + "clip_ratio/high_max": 
4.905230980511988e-06, + "clip_ratio/high_mean": 1.226307745127997e-06, + "clip_ratio/low_mean": 5.500513248080097e-05, + "clip_ratio/low_min": 7.924934834591113e-06, + "clip_ratio/region_mean": 5.6231440112242126e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14996.0, + "completions/mean_length": 6911.1015625, + "completions/mean_terminated_length": 6108.3134765625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.9260227829217911, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004494607914239168, + "learning_rate": 1e-05, + "loss": 0.0269, + "num_tokens": 204518261.0, + "reward": 0.4140625, + "reward_std": 0.34033796191215515, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.0015266009140759706, + "sampling/sampling_logp_difference/max": 6.484711647033691, + "sampling/sampling_logp_difference/mean": 0.020527629181742668, + "step": 240 + }, + { + "clip_ratio/high_max": 8.293764039990492e-06, + "clip_ratio/high_mean": 2.073441009997623e-06, + "clip_ratio/low_mean": 4.75325257411896e-05, + "clip_ratio/low_min": 3.599504680096288e-06, + "clip_ratio/region_mean": 4.960596663750039e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14637.0, + "completions/mean_length": 6972.921875, + "completions/mean_terminated_length": 6823.5400390625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 1.0095533654093742, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029451537411659956, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 205433843.0, + "reward": 
0.3515625, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 2.6675223125494085e-05, + "sampling/sampling_logp_difference/max": 10.53177547454834, + "sampling/sampling_logp_difference/mean": 0.02013089321553707, + "step": 241 + }, + { + "clip_ratio/high_max": 4.163383164268453e-05, + "clip_ratio/high_mean": 1.382379150527413e-05, + "clip_ratio/low_mean": 3.86000854177837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2423876240936806e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 6706.6640625, + "completions/mean_terminated_length": 6313.2763671875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 0.8647518903017044, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003371767932549119, + "learning_rate": 1e-05, + "loss": 0.073, + "num_tokens": 206310296.0, + "reward": 0.5703125, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 2.948181463580113e-05, + "sampling/sampling_logp_difference/max": 10.431736946105957, + "sampling/sampling_logp_difference/mean": 0.019770190119743347, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4946740381892596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4946740381892596e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16136.0, + "completions/mean_length": 6882.609375, + "completions/mean_terminated_length": 6415.32763671875, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "entropy": 1.013342760503292, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016336971893906593, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 207210974.0, + "reward": 0.359375, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999210834503174, + "sampling/importance_sampling_ratio/min": 0.0013267879839986563, + "sampling/sampling_logp_difference/max": 6.624994277954102, + "sampling/sampling_logp_difference/mean": 0.02139991894364357, + "step": 243 + }, + { + "clip_ratio/high_max": 1.4866403944324702e-05, + "clip_ratio/high_mean": 3.7166009860811755e-06, + "clip_ratio/low_mean": 3.938925010515959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.310585177336179e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15203.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 6195.7421875, + "completions/mean_terminated_length": 6195.7421875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.8448907434940338, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005036406684666872, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 208021893.0, + "reward": 0.5234375, + "reward_std": 0.3453505039215088, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955892562866, + "sampling/importance_sampling_ratio/min": 0.0040348549373447895, + 
"sampling/sampling_logp_difference/max": 5.512784957885742, + "sampling/sampling_logp_difference/mean": 0.018679853528738022, + "step": 244 + }, + { + "clip_ratio/high_max": 1.1244883353356272e-05, + "clip_ratio/high_mean": 2.811220838339068e-06, + "clip_ratio/low_mean": 3.422392001084518e-05, + "clip_ratio/low_min": 6.451612989621935e-06, + "clip_ratio/region_mean": 3.703514119024476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 6829.609375, + "completions/mean_terminated_length": 6521.40283203125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.8679579794406891, + "epoch": 0.22539098436062557, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029643685556948185, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 208912059.0, + "reward": 0.46875, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00038063788088038564, + "sampling/sampling_logp_difference/max": 7.873661994934082, + "sampling/sampling_logp_difference/mean": 0.018488366156816483, + "step": 245 + }, + { + "clip_ratio/high_max": 2.2700600311509334e-05, + "clip_ratio/high_mean": 5.675150077877333e-06, + "clip_ratio/low_mean": 3.138338854569156e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.705853873725573e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14503.0, + "completions/max_terminated_length": 14503.0, + "completions/mean_length": 5444.4453125, + "completions/mean_terminated_length": 5444.4453125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 1.0460086688399315, + "epoch": 0.22631094756209752, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035942886024713516, + "learning_rate": 1e-05, + "loss": 0.0932, + "num_tokens": 209627804.0, + "reward": 0.484375, + "reward_std": 0.338498055934906, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997478723526, + "sampling/importance_sampling_ratio/min": 0.03179635480046272, + "sampling/sampling_logp_difference/max": 3.4484035968780518, + "sampling/sampling_logp_difference/mean": 0.020146891474723816, + "step": 246 + }, + { + "clip_ratio/high_max": 1.477029400120955e-05, + "clip_ratio/high_mean": 4.552578502625693e-06, + "clip_ratio/low_mean": 5.265122354103369e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.720380158891203e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 7657.390625, + "completions/mean_terminated_length": 7152.544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "entropy": 0.9528728649020195, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0044983453117311, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 210630150.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 3.8753667467972264e-05, + "sampling/sampling_logp_difference/max": 10.158285140991211, + "sampling/sampling_logp_difference/mean": 0.02131088823080063, + "step": 247 + }, + { + "clip_ratio/high_max": 8.607642712377128e-06, + "clip_ratio/high_mean": 2.151910678094282e-06, + "clip_ratio/low_mean": 2.2759413695894182e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.491132454451872e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7574.3515625, + "completions/mean_terminated_length": 7504.984375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 1.0009776800870895, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006095650140196085, + "learning_rate": 1e-05, + "loss": 0.0566, + "num_tokens": 211620355.0, + "reward": 0.3515625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.0013946897815912962, + "sampling/sampling_logp_difference/max": 6.575083255767822, + "sampling/sampling_logp_difference/mean": 0.021727774292230606, + "step": 248 + }, + { + "clip_ratio/high_max": 1.764823082339717e-05, + "clip_ratio/high_mean": 5.141430960975413e-06, + "clip_ratio/low_mean": 5.936152001595474e-05, + "clip_ratio/low_min": 9.155588486464694e-06, + "clip_ratio/region_mean": 6.450295177273802e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14915.0, + "completions/mean_length": 7919.6875, + "completions/mean_terminated_length": 7716.54443359375, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "entropy": 1.0405654236674309, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037038614973425865, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 212654747.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.0057550109922885895, + "sampling/sampling_logp_difference/max": 5.157684326171875, + "sampling/sampling_logp_difference/mean": 0.022051017731428146, + "step": 249 + }, + { + "clip_ratio/high_max": 1.265254240934155e-05, + "clip_ratio/high_mean": 3.1631356023353874e-06, + "clip_ratio/low_mean": 4.716233138424286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.032546687289141e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 8613.4765625, + "completions/mean_terminated_length": 7735.0693359375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.890489287674427, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00325607368722558, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 213774584.0, + "reward": 0.40625, + "reward_std": 0.33668074011802673, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 1.670176425250247e-05, + "sampling/sampling_logp_difference/max": 10.999996185302734, + "sampling/sampling_logp_difference/mean": 0.020002499222755432, + "step": 250 + }, + { + "clip_ratio/high_max": 1.6404605503339553e-05, + "clip_ratio/high_mean": 4.101151375834888e-06, + "clip_ratio/low_mean": 3.880500707964529e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2906158682853857e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + "completions/mean_length": 7324.8984375, + "completions/mean_terminated_length": 
6473.1884765625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.761004202067852, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038265211042016745, + "learning_rate": 1e-05, + "loss": 0.0717, + "num_tokens": 214728371.0, + "reward": 0.515625, + "reward_std": 0.32719239592552185, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168085098267, + "sampling/importance_sampling_ratio/min": 0.0003049026126973331, + "sampling/sampling_logp_difference/max": 8.095518112182617, + "sampling/sampling_logp_difference/mean": 0.018367979675531387, + "step": 251 + }, + { + "clip_ratio/high_max": 5.624549885396846e-06, + "clip_ratio/high_mean": 1.4061374713492114e-06, + "clip_ratio/low_mean": 3.6433707123251224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7839844594600436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14167.0, + "completions/max_terminated_length": 14167.0, + "completions/mean_length": 6422.0859375, + "completions/mean_terminated_length": 6422.0859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.9946094751358032, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002729539293795824, + "learning_rate": 1e-05, + "loss": 0.0158, + "num_tokens": 215570806.0, + "reward": 0.3515625, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999935030937195, + "sampling/importance_sampling_ratio/min": 0.026308411732316017, + "sampling/sampling_logp_difference/max": 3.637866497039795, + "sampling/sampling_logp_difference/mean": 0.021903935819864273, + 
"step": 252 + }, + { + "clip_ratio/high_max": 7.2379848461423535e-06, + "clip_ratio/high_mean": 1.8094962115355884e-06, + "clip_ratio/low_mean": 3.17277934982485e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353728982347093e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15585.0, + "completions/mean_length": 6845.2890625, + "completions/mean_terminated_length": 6693.88134765625, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8822609707713127, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004974282346665859, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 216465635.0, + "reward": 0.5390625, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000090599060059, + "sampling/importance_sampling_ratio/min": 8.749838889343664e-05, + "sampling/sampling_logp_difference/max": 9.343890190124512, + "sampling/sampling_logp_difference/mean": 0.019389234483242035, + "step": 253 + }, + { + "clip_ratio/high_max": 1.58592818024772e-05, + "clip_ratio/high_mean": 3.9648204506193e-06, + "clip_ratio/low_mean": 4.096964960353944e-05, + "clip_ratio/low_min": 1.7403560605089297e-05, + "clip_ratio/region_mean": 4.49344687467601e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 7805.484375, + "completions/mean_terminated_length": 7528.7578125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.9977599084377289, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0033159854356199503, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 
217485089.0, + "reward": 0.421875, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 7.967943383846432e-05, + "sampling/sampling_logp_difference/max": 9.437499046325684, + "sampling/sampling_logp_difference/mean": 0.021925684064626694, + "step": 254 + }, + { + "clip_ratio/high_max": 1.8265397557115648e-05, + "clip_ratio/high_mean": 4.566349389278912e-06, + "clip_ratio/low_mean": 4.044636898470344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5012717691861326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15681.0, + "completions/mean_length": 7737.5546875, + "completions/mean_terminated_length": 7530.04052734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.8667014688253403, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034952745772898197, + "learning_rate": 1e-05, + "loss": 0.0775, + "num_tokens": 218496040.0, + "reward": 0.453125, + "reward_std": 0.3085102438926697, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999128580093384, + "sampling/importance_sampling_ratio/min": 6.726370338583365e-05, + "sampling/sampling_logp_difference/max": 9.606889724731445, + "sampling/sampling_logp_difference/mean": 0.019742710515856743, + "step": 255 + }, + { + "clip_ratio/high_max": 8.244294804171659e-06, + "clip_ratio/high_mean": 2.0610737010429148e-06, + "clip_ratio/low_mean": 3.204250072030845e-05, + "clip_ratio/low_min": 3.323495775475749e-06, + "clip_ratio/region_mean": 3.410357436450795e-05, + 
"completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15858.0, + "completions/mean_length": 7365.84375, + "completions/mean_terminated_length": 6601.59326171875, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "entropy": 0.8151945173740387, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038676802068948746, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 219459140.0, + "reward": 0.46875, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00023387260443996638, + "sampling/sampling_logp_difference/max": 8.360733985900879, + "sampling/sampling_logp_difference/mean": 0.018882082775235176, + "step": 256 + }, + { + "clip_ratio/high_max": 6.87833608026267e-06, + "clip_ratio/high_mean": 2.9462287329806713e-06, + "clip_ratio/low_mean": 5.435333650893881e-05, + "clip_ratio/low_min": 5.33937054569833e-06, + "clip_ratio/region_mean": 5.729956546929316e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 6448.0078125, + "completions/mean_terminated_length": 6369.771484375, + "completions/min_length": 1128.0, + "completions/min_terminated_length": 1128.0, + "entropy": 0.9546648040413857, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004310046322643757, + "learning_rate": 1e-05, + "loss": 0.1082, + "num_tokens": 220304605.0, + "reward": 0.5703125, + "reward_std": 0.35611939430236816, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999396800994873, + "sampling/importance_sampling_ratio/min": 0.0001234127557836473, + "sampling/sampling_logp_difference/max": 8.99997615814209, + "sampling/sampling_logp_difference/mean": 0.020253397524356842, + "step": 257 + }, + { + "clip_ratio/high_max": 6.196094091137638e-06, + "clip_ratio/high_mean": 1.5490235227844096e-06, + "clip_ratio/low_mean": 2.5416685957679874e-05, + "clip_ratio/low_min": 5.5736391004757024e-06, + "clip_ratio/region_mean": 2.696570959415112e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 7457.6484375, + "completions/mean_terminated_length": 6941.24755859375, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "entropy": 0.8182889074087143, + "epoch": 0.23735050597976082, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026646999176591635, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 221281968.0, + "reward": 0.4453125, + "reward_std": 0.2012200653553009, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173283576965, + "sampling/importance_sampling_ratio/min": 2.902353571698768e-06, + "sampling/sampling_logp_difference/max": 12.749988555908203, + "sampling/sampling_logp_difference/mean": 0.019208962097764015, + "step": 258 + }, + { + "clip_ratio/high_max": 1.6189535017474554e-05, + "clip_ratio/high_mean": 4.047383754368639e-06, + "clip_ratio/low_mean": 3.127787306311802e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.532525670379982e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + "completions/mean_length": 8561.109375, + "completions/mean_terminated_length": 7969.79052734375, + 
"completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.9581378549337387, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016026750672608614, + "learning_rate": 1e-05, + "loss": 0.0131, + "num_tokens": 222399046.0, + "reward": 0.34375, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 1.653693971093162e-06, + "sampling/sampling_logp_difference/max": 13.312499046325684, + "sampling/sampling_logp_difference/mean": 0.02173236384987831, + "step": 259 + }, + { + "clip_ratio/high_max": 1.4200771602190798e-05, + "clip_ratio/high_mean": 4.3255887476334465e-06, + "clip_ratio/low_mean": 5.2955770115659107e-05, + "clip_ratio/low_min": 3.402656830076012e-06, + "clip_ratio/region_mean": 5.7281358749605715e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16239.0, + "completions/mean_length": 7152.34375, + "completions/mean_terminated_length": 7079.6533203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9052041247487068, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005460259038954973, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 223335010.0, + "reward": 0.4296875, + "reward_std": 0.3356297016143799, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966621398926, + "sampling/importance_sampling_ratio/min": 0.010161337442696095, + "sampling/sampling_logp_difference/max": 4.589165210723877, + "sampling/sampling_logp_difference/mean": 0.01986619457602501, 
+ "step": 260 + }, + { + "clip_ratio/high_max": 1.4350314813782461e-05, + "clip_ratio/high_mean": 3.5875787034456152e-06, + "clip_ratio/low_mean": 3.81288905373367e-05, + "clip_ratio/low_min": 8.099272235995159e-06, + "clip_ratio/region_mean": 4.1716469809216505e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 6678.65625, + "completions/mean_terminated_length": 6524.603515625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.9043187350034714, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005933742038905621, + "learning_rate": 1e-05, + "loss": 0.0966, + "num_tokens": 224207006.0, + "reward": 0.484375, + "reward_std": 0.3316681981086731, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000031590461731, + "sampling/importance_sampling_ratio/min": 0.0011734943836927414, + "sampling/sampling_logp_difference/max": 6.747769355773926, + "sampling/sampling_logp_difference/mean": 0.019827336072921753, + "step": 261 + }, + { + "clip_ratio/high_max": 1.6498819377375185e-05, + "clip_ratio/high_mean": 4.124704844343796e-06, + "clip_ratio/low_mean": 3.601791678420341e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.014262168539062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15972.0, + "completions/mean_length": 6999.0390625, + "completions/mean_terminated_length": 6850.07177734375, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8109970837831497, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003635740838944912, + "learning_rate": 1e-05, + "loss": 0.104, + "num_tokens": 
225122891.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999303817749023, + "sampling/importance_sampling_ratio/min": 1.6911570128286257e-05, + "sampling/sampling_logp_difference/max": 10.987512588500977, + "sampling/sampling_logp_difference/mean": 0.018912551924586296, + "step": 262 + }, + { + "clip_ratio/high_max": 9.527577958579059e-06, + "clip_ratio/high_mean": 2.3818944896447647e-06, + "clip_ratio/low_mean": 3.766565987461945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.004755419373396e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15713.0, + "completions/mean_length": 7483.7109375, + "completions/mean_terminated_length": 7045.9912109375, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "entropy": 0.9473970532417297, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003405241761356592, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 226102462.0, + "reward": 0.4453125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00002920627594, + "sampling/importance_sampling_ratio/min": 0.00525119062513113, + "sampling/sampling_logp_difference/max": 5.249300479888916, + "sampling/sampling_logp_difference/mean": 0.021076779812574387, + "step": 263 + }, + { + "clip_ratio/high_max": 1.5867321963014547e-05, + "clip_ratio/high_mean": 3.966830490753637e-06, + "clip_ratio/low_mean": 3.8259706570897833e-05, + "clip_ratio/low_min": 3.549019083948224e-06, + "clip_ratio/region_mean": 4.2226537743772496e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 7569.03125, + "completions/mean_terminated_length": 7357.47216796875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9231455475091934, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0025927501264959574, + "learning_rate": 1e-05, + "loss": 0.0801, + "num_tokens": 227093562.0, + "reward": 0.3984375, + "reward_std": 0.19097033143043518, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0052477638237178326, + "sampling/sampling_logp_difference/max": 5.249953269958496, + "sampling/sampling_logp_difference/mean": 0.020578444004058838, + "step": 264 + }, + { + "clip_ratio/high_max": 1.344091060673236e-05, + "clip_ratio/high_mean": 3.36022765168309e-06, + "clip_ratio/low_mean": 4.253613235505327e-05, + "clip_ratio/low_min": 3.5579084851633525e-06, + "clip_ratio/region_mean": 4.5896360120423196e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 7589.2734375, + "completions/mean_terminated_length": 7378.2001953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.9265239909291267, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030512227676808834, + "learning_rate": 1e-05, + "loss": 0.04, + "num_tokens": 228086405.0, + "reward": 0.4296875, + "reward_std": 0.27905434370040894, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.0002165911573683843, + "sampling/sampling_logp_difference/max": 8.437499046325684, + "sampling/sampling_logp_difference/mean": 0.020208362489938736, + "step": 265 + }, + { + "clip_ratio/high_max": 1.9613525410022703e-05, + "clip_ratio/high_mean": 4.903381352505676e-06, + "clip_ratio/low_mean": 3.184792547017423e-05, + "clip_ratio/low_min": 7.29296516510658e-06, + "clip_ratio/region_mean": 3.675130722058384e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 8420.6875, + "completions/mean_terminated_length": 8096.97509765625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "entropy": 0.9572964608669281, + "epoch": 0.24471021159153633, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022430522367358208, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 229183765.0, + "reward": 0.34375, + "reward_std": 0.309583842754364, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 0.00029693738906644285, + "sampling/sampling_logp_difference/max": 8.121989250183105, + "sampling/sampling_logp_difference/mean": 0.021570362150669098, + "step": 266 + }, + { + "clip_ratio/high_max": 6.728750577167375e-06, + "clip_ratio/high_mean": 1.6821876442918438e-06, + "clip_ratio/low_mean": 2.1682553096979973e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.336474062758498e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15736.0, + "completions/mean_length": 6809.765625, + "completions/mean_terminated_length": 6579.984375, + 
"completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.884086549282074, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004295065999031067, + "learning_rate": 1e-05, + "loss": 0.1058, + "num_tokens": 230077607.0, + "reward": 0.484375, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294281005859, + "sampling/importance_sampling_ratio/min": 0.00754612497985363, + "sampling/sampling_logp_difference/max": 4.886721134185791, + "sampling/sampling_logp_difference/mean": 0.019895706325769424, + "step": 267 + }, + { + "clip_ratio/high_max": 2.8609347509700456e-05, + "clip_ratio/high_mean": 7.152336877425114e-06, + "clip_ratio/low_mean": 5.158006410965754e-05, + "clip_ratio/low_min": 5.210069957684027e-06, + "clip_ratio/region_mean": 5.873240070286556e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15080.0, + "completions/mean_length": 7340.6953125, + "completions/mean_terminated_length": 6973.0810546875, + "completions/min_length": 1616.0, + "completions/min_terminated_length": 1616.0, + "entropy": 0.9920620769262314, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004631794057786465, + "learning_rate": 1e-05, + "loss": 0.0096, + "num_tokens": 231035616.0, + "reward": 0.4375, + "reward_std": 0.3235401213169098, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.0002508950710762292, + "sampling/sampling_logp_difference/max": 8.290475845336914, + "sampling/sampling_logp_difference/mean": 0.020591016858816147, + 
"step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.3085940774290066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3085940774290066e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14120.0, + "completions/mean_length": 6748.875, + "completions/mean_terminated_length": 6595.93701171875, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.9867061004042625, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035752104595303535, + "learning_rate": 1e-05, + "loss": 0.0455, + "num_tokens": 231920056.0, + "reward": 0.40625, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.0003869794018100947, + "sampling/sampling_logp_difference/max": 7.8571391105651855, + "sampling/sampling_logp_difference/mean": 0.02061416581273079, + "step": 269 + }, + { + "clip_ratio/high_max": 1.2506750408647349e-05, + "clip_ratio/high_mean": 3.1266876021618373e-06, + "clip_ratio/low_mean": 3.10397430212106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.416643085074611e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 7260.3046875, + "completions/mean_terminated_length": 7188.46435546875, + "completions/min_length": 1384.0, + "completions/min_terminated_length": 1384.0, + "entropy": 1.0388494208455086, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036644963547587395, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 232869159.0, + "reward": 0.390625, + "reward_std": 
0.2359209954738617, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999546408653259, + "sampling/importance_sampling_ratio/min": 0.0008660226594656706, + "sampling/sampling_logp_difference/max": 7.051599502563477, + "sampling/sampling_logp_difference/mean": 0.02120530977845192, + "step": 270 + }, + { + "clip_ratio/high_max": 2.704355301830219e-05, + "clip_ratio/high_mean": 6.760888254575548e-06, + "clip_ratio/low_mean": 3.1861192269388994e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.862208097871189e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16073.0, + "completions/max_terminated_length": 16073.0, + "completions/mean_length": 6354.4609375, + "completions/mean_terminated_length": 6354.4609375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "entropy": 0.8405331820249557, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004709267523139715, + "learning_rate": 1e-05, + "loss": 0.0039, + "num_tokens": 233702842.0, + "reward": 0.546875, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 0.0046309432946145535, + "sampling/sampling_logp_difference/max": 5.37499475479126, + "sampling/sampling_logp_difference/mean": 0.019126038998365402, + "step": 271 + }, + { + "clip_ratio/high_max": 9.749228638611385e-06, + "clip_ratio/high_mean": 2.437307159652846e-06, + "clip_ratio/low_mean": 3.855073941849696e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.098804652130639e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16026.0, + "completions/mean_length": 6514.578125, + "completions/mean_terminated_length": 6357.9208984375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 1.0254098922014236, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003066045930609107, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 234556348.0, + "reward": 0.4375, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805092811584, + "sampling/importance_sampling_ratio/min": 0.005210204049944878, + "sampling/sampling_logp_difference/max": 5.257136344909668, + "sampling/sampling_logp_difference/mean": 0.019960148259997368, + "step": 272 + }, + { + "clip_ratio/high_max": 1.0475813724042382e-05, + "clip_ratio/high_mean": 2.6189534310105955e-06, + "clip_ratio/low_mean": 3.487835761006863e-05, + "clip_ratio/low_min": 2.9392399483185727e-06, + "clip_ratio/region_mean": 3.749731081370555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 7379.5546875, + "completions/mean_terminated_length": 7236.62744140625, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 1.0397320613265038, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005132520105689764, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 235521091.0, + "reward": 0.2890625, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999256134033203, + "sampling/importance_sampling_ratio/min": 
0.00016659013635944575, + "sampling/sampling_logp_difference/max": 8.699974060058594, + "sampling/sampling_logp_difference/mean": 0.021417103707790375, + "step": 273 + }, + { + "clip_ratio/high_max": 1.9904123973901733e-05, + "clip_ratio/high_mean": 5.776861314643611e-06, + "clip_ratio/low_mean": 2.6659268655748747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2436129686175263e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14565.0, + "completions/mean_length": 7837.1640625, + "completions/mean_terminated_length": 7632.04052734375, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "entropy": 0.8400963917374611, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0028969801496714354, + "learning_rate": 1e-05, + "loss": 0.0143, + "num_tokens": 236544160.0, + "reward": 0.3828125, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887943267822, + "sampling/importance_sampling_ratio/min": 2.883308241052873e-07, + "sampling/sampling_logp_difference/max": 15.059157371520996, + "sampling/sampling_logp_difference/mean": 0.019267702475190163, + "step": 274 + }, + { + "clip_ratio/high_max": 8.562770290154731e-06, + "clip_ratio/high_mean": 2.1406925725386827e-06, + "clip_ratio/low_mean": 4.060094340729847e-05, + "clip_ratio/low_min": 3.8700886761944275e-06, + "clip_ratio/region_mean": 4.2741635979837156e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15350.0, + "completions/mean_length": 6696.3515625, + "completions/mean_terminated_length": 6542.57958984375, + "completions/min_length": 1239.0, + "completions/min_terminated_length": 1239.0, + "entropy": 
0.8495818004012108, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003412836929783225, + "learning_rate": 1e-05, + "loss": 0.0803, + "num_tokens": 237423101.0, + "reward": 0.515625, + "reward_std": 0.37981897592544556, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.012152798473834991, + "sampling/sampling_logp_difference/max": 4.410195827484131, + "sampling/sampling_logp_difference/mean": 0.018458625301718712, + "step": 275 + }, + { + "clip_ratio/high_max": 1.1463653436294408e-05, + "clip_ratio/high_mean": 3.646129641765583e-06, + "clip_ratio/low_mean": 6.144847083078275e-05, + "clip_ratio/low_min": 1.110105540647055e-05, + "clip_ratio/region_mean": 6.509460160941671e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15666.0, + "completions/mean_length": 7700.3671875, + "completions/mean_terminated_length": 7121.45849609375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.8258870914578438, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024443145375698805, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 238429956.0, + "reward": 0.375, + "reward_std": 0.2872493863105774, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999113082885742, + "sampling/importance_sampling_ratio/min": 0.00026112530031241477, + "sampling/sampling_logp_difference/max": 8.250510215759277, + "sampling/sampling_logp_difference/mean": 0.019427984952926636, + "step": 276 + }, + { + "clip_ratio/high_max": 4.218127742205979e-06, + "clip_ratio/high_mean": 
1.0545319355514948e-06, + "clip_ratio/low_mean": 1.7289162997258245e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.834369493280974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16112.0, + "completions/mean_length": 6255.21875, + "completions/mean_terminated_length": 6094.44482421875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.8179014846682549, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022747826296836138, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 239250160.0, + "reward": 0.5234375, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.0002633975527714938, + "sampling/sampling_logp_difference/max": 8.241846084594727, + "sampling/sampling_logp_difference/mean": 0.018723051995038986, + "step": 277 + }, + { + "clip_ratio/high_max": 1.698448841125355e-05, + "clip_ratio/high_mean": 5.369374321162468e-06, + "clip_ratio/low_mean": 6.14647315160255e-05, + "clip_ratio/low_min": 5.043576493335422e-06, + "clip_ratio/region_mean": 6.683410583718796e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15321.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 6914.9609375, + "completions/mean_terminated_length": 6914.9609375, + "completions/min_length": 730.0, + "completions/min_terminated_length": 730.0, + "entropy": 0.9700981751084328, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005685295443981886, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 240156211.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 
0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 4.5509867049986497e-05, + "sampling/sampling_logp_difference/max": 9.997581481933594, + "sampling/sampling_logp_difference/mean": 0.021195171400904655, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9186837764427764e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9186837764427764e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15469.0, + "completions/mean_length": 5227.53125, + "completions/mean_terminated_length": 5139.68505859375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "entropy": 0.9116031974554062, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003880272386595607, + "learning_rate": 1e-05, + "loss": 0.1246, + "num_tokens": 240845295.0, + "reward": 0.6328125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.6328125, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000362396240234, + "sampling/importance_sampling_ratio/min": 0.00012422871077433228, + "sampling/sampling_logp_difference/max": 8.993386268615723, + "sampling/sampling_logp_difference/mean": 0.018801718950271606, + "step": 279 + }, + { + "clip_ratio/high_max": 2.5015486926349695e-05, + "clip_ratio/high_mean": 8.084949570275057e-06, + "clip_ratio/low_mean": 5.524710468307603e-05, + "clip_ratio/low_min": 3.776891389861703e-06, + "clip_ratio/region_mean": 6.333205465125502e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 
8065.4765625, + "completions/mean_terminated_length": 7510.90869140625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.7446574792265892, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0028986844699829817, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 241895676.0, + "reward": 0.4921875, + "reward_std": 0.3474721610546112, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.0017039099475368857, + "sampling/sampling_logp_difference/max": 6.3748297691345215, + "sampling/sampling_logp_difference/mean": 0.01853121444582939, + "step": 280 + }, + { + "clip_ratio/high_max": 9.486341014053323e-06, + "clip_ratio/high_mean": 2.371585253513331e-06, + "clip_ratio/low_mean": 2.896106741445692e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.133265261112683e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15534.0, + "completions/max_terminated_length": 15534.0, + "completions/mean_length": 6127.359375, + "completions/mean_terminated_length": 6127.359375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.8569132760167122, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003845847910270095, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 242698258.0, + "reward": 0.53125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000942945480347, + "sampling/importance_sampling_ratio/min": 0.00043231461313553154, + "sampling/sampling_logp_difference/max": 7.746356964111328, + 
"sampling/sampling_logp_difference/mean": 0.01856958493590355, + "step": 281 + }, + { + "clip_ratio/high_max": 2.9848330086679198e-05, + "clip_ratio/high_mean": 7.4620825216697995e-06, + "clip_ratio/low_mean": 4.3558867673709756e-05, + "clip_ratio/low_min": 4.417741820361698e-06, + "clip_ratio/region_mean": 5.1020949285884853e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15192.0, + "completions/mean_length": 6600.1484375, + "completions/mean_terminated_length": 6365.33642578125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.78924310952425, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003953634761273861, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 243560957.0, + "reward": 0.5546875, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.0006525487406179309, + "sampling/sampling_logp_difference/max": 7.334624767303467, + "sampling/sampling_logp_difference/mean": 0.018097909167408943, + "step": 282 + }, + { + "clip_ratio/high_max": 6.635561703660642e-06, + "clip_ratio/high_mean": 1.6588904259151604e-06, + "clip_ratio/low_mean": 2.737523408313791e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9034124281679397e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7852.171875, + "completions/mean_terminated_length": 7852.171875, + "completions/min_length": 1276.0, + "completions/min_terminated_length": 1276.0, + "entropy": 1.0598893761634827, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00360781978815794, 
+ "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 244585923.0, + "reward": 0.3125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 4.2073770600836724e-05, + "sampling/sampling_logp_difference/max": 10.076086044311523, + "sampling/sampling_logp_difference/mean": 0.022330068051815033, + "step": 283 + }, + { + "clip_ratio/high_max": 3.1540168947685743e-06, + "clip_ratio/high_mean": 7.885042236921436e-07, + "clip_ratio/low_mean": 4.7973388973332476e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.876189268543385e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7972.2265625, + "completions/mean_terminated_length": 7700.87890625, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.933217465877533, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0027661293279379606, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 245628064.0, + "reward": 0.28125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 3.1466843211092055e-05, + "sampling/sampling_logp_difference/max": 10.366576194763184, + "sampling/sampling_logp_difference/mean": 0.021125148981809616, + "step": 284 + }, + { + "clip_ratio/high_max": 1.2965969062861404e-05, + "clip_ratio/high_mean": 3.241492265715351e-06, + "clip_ratio/low_mean": 4.6317693090713874e-05, + "clip_ratio/low_min": 3.820877282123547e-06, + 
"clip_ratio/region_mean": 4.955918507221213e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15744.0, + "completions/mean_length": 7135.6953125, + "completions/mean_terminated_length": 6913.736328125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.7786942347884178, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005680318456143141, + "learning_rate": 1e-05, + "loss": 0.0786, + "num_tokens": 246561329.0, + "reward": 0.4296875, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462366104126, + "sampling/importance_sampling_ratio/min": 5.9032357967225835e-05, + "sampling/sampling_logp_difference/max": 9.737424850463867, + "sampling/sampling_logp_difference/mean": 0.018504241481423378, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.22437145175536e-05, + "clip_ratio/low_min": 1.4025082009538892e-05, + "clip_ratio/region_mean": 4.22437145175536e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 6704.046875, + "completions/mean_terminated_length": 6627.82666015625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 1.0435140281915665, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026402862276881933, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 247437415.0, + "reward": 0.3828125, + "reward_std": 0.31276631355285645, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.0007800163584761322, + "sampling/sampling_logp_difference/max": 7.156195640563965, + "sampling/sampling_logp_difference/mean": 0.02134273201227188, + "step": 286 + }, + { + "clip_ratio/high_max": 2.223430897174694e-05, + "clip_ratio/high_mean": 6.8746438159905665e-06, + "clip_ratio/low_mean": 4.7084630978133646e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3959275192028144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 5892.5078125, + "completions/mean_terminated_length": 5725.9765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.8004944771528244, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003993614576756954, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 248211112.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0024652592837810516, + "sampling/sampling_logp_difference/max": 6.005458354949951, + "sampling/sampling_logp_difference/mean": 0.01924925297498703, + "step": 287 + }, + { + "clip_ratio/high_max": 2.1833082200828358e-05, + "clip_ratio/high_mean": 5.458270550207089e-06, + "clip_ratio/low_mean": 3.415995615796419e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.961822596920683e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 7812.140625, + "completions/mean_terminated_length": 7316.24755859375, + "completions/min_length": 1515.0, + 
"completions/min_terminated_length": 1515.0, + "entropy": 0.8841542899608612, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001573400106281042, + "learning_rate": 1e-05, + "loss": 0.0823, + "num_tokens": 249228106.0, + "reward": 0.4765625, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 0.001001527882181108, + "sampling/sampling_logp_difference/max": 6.906228542327881, + "sampling/sampling_logp_difference/mean": 0.01956877112388611, + "step": 288 + }, + { + "clip_ratio/high_max": 1.014439021673752e-05, + "clip_ratio/high_mean": 2.53609755418438e-06, + "clip_ratio/low_mean": 3.068193461785995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.321803217204433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 6372.953125, + "completions/mean_terminated_length": 6132.6884765625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.8228401988744736, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021125099156051874, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 250063284.0, + "reward": 0.5, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 4.8329173296224326e-05, + "sampling/sampling_logp_difference/max": 9.937475204467773, + "sampling/sampling_logp_difference/mean": 0.01943521574139595, + "step": 289 + }, + { + "clip_ratio/high_max": 
7.023906164249638e-06, + "clip_ratio/high_mean": 1.7559765410624095e-06, + "clip_ratio/low_mean": 2.526416994896863e-05, + "clip_ratio/low_min": 6.7760895490209805e-06, + "clip_ratio/region_mean": 2.7020146660561295e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16270.0, + "completions/mean_length": 7817.8671875, + "completions/mean_terminated_length": 7396.58154296875, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.9454319775104523, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022315154783427715, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 251085123.0, + "reward": 0.40625, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 2.8720330647047376e-06, + "sampling/sampling_logp_difference/max": 12.760490417480469, + "sampling/sampling_logp_difference/mean": 0.021764669567346573, + "step": 290 + }, + { + "clip_ratio/high_max": 1.4797966287005693e-05, + "clip_ratio/high_mean": 3.699491571751423e-06, + "clip_ratio/low_mean": 4.36271948274225e-05, + "clip_ratio/low_min": 3.6957101201551268e-06, + "clip_ratio/region_mean": 4.732668639917392e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 7168.4921875, + "completions/mean_terminated_length": 6635.36328125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.8433891162276268, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004663965664803982, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 252020906.0, + "reward": 
0.5546875, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.0003851866349577904, + "sampling/sampling_logp_difference/max": 7.861782550811768, + "sampling/sampling_logp_difference/mean": 0.01929781585931778, + "step": 291 + }, + { + "clip_ratio/high_max": 1.996871560550062e-05, + "clip_ratio/high_mean": 6.089093403716106e-06, + "clip_ratio/low_mean": 4.2792244585143635e-05, + "clip_ratio/low_min": 1.0337215371691855e-05, + "clip_ratio/region_mean": 4.8881338216233416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7322.5078125, + "completions/mean_terminated_length": 6876.8603515625, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 0.9157031401991844, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036942458245903254, + "learning_rate": 1e-05, + "loss": 0.079, + "num_tokens": 252977435.0, + "reward": 0.3359375, + "reward_std": 0.24275577068328857, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.00029605376766994596, + "sampling/sampling_logp_difference/max": 8.124969482421875, + "sampling/sampling_logp_difference/mean": 0.0205365102738142, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.631919460327481e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.631919460327481e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16078.0, + "completions/mean_length": 7025.484375, + "completions/mean_terminated_length": 6723.5966796875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 1.1329731941223145, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034127074759453535, + "learning_rate": 1e-05, + "loss": 0.0227, + "num_tokens": 253896161.0, + "reward": 0.25, + "reward_std": 0.27722424268722534, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0005197672289796174, + "sampling/sampling_logp_difference/max": 7.562129497528076, + "sampling/sampling_logp_difference/mean": 0.023741140961647034, + "step": 293 + }, + { + "clip_ratio/high_max": 4.368643658381188e-06, + "clip_ratio/high_mean": 1.092160914595297e-06, + "clip_ratio/low_mean": 2.4661783299961826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5753944555617636e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13776.0, + "completions/mean_length": 5996.1796875, + "completions/mean_terminated_length": 5661.08837890625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8773328885436058, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003959407564252615, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 254690264.0, + "reward": 0.53125, + "reward_std": 0.26645541191101074, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 1.4735347519945208e-07, + 
"sampling/sampling_logp_difference/max": 15.73043155670166, + "sampling/sampling_logp_difference/mean": 0.018407585099339485, + "step": 294 + }, + { + "clip_ratio/high_max": 1.616483677935321e-05, + "clip_ratio/high_mean": 4.041209194838302e-06, + "clip_ratio/low_mean": 3.736187466074625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140308453770558e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16383.0, + "completions/mean_length": 7165.328125, + "completions/mean_terminated_length": 6867.951171875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.9502597972750664, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030910037457942963, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 255626394.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000731945037842, + "sampling/importance_sampling_ratio/min": 0.00022311302018351853, + "sampling/sampling_logp_difference/max": 8.407832145690918, + "sampling/sampling_logp_difference/mean": 0.020668907091021538, + "step": 295 + }, + { + "clip_ratio/high_max": 1.1702686606440693e-05, + "clip_ratio/high_mean": 2.9256716516101733e-06, + "clip_ratio/low_mean": 5.5247357522603124e-05, + "clip_ratio/low_min": 3.6811261452385224e-06, + "clip_ratio/region_mean": 5.8173028264718596e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15375.0, + "completions/mean_length": 8001.9296875, + "completions/mean_terminated_length": 7661.34912109375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.8591345250606537, + "epoch": 
0.27230910763569455, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037233952898532152, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 256673457.0, + "reward": 0.421875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.0021876997780054808, + "sampling/sampling_logp_difference/max": 6.124904632568359, + "sampling/sampling_logp_difference/mean": 0.020540472120046616, + "step": 296 + }, + { + "clip_ratio/high_max": 3.721341136042611e-05, + "clip_ratio/high_mean": 1.2759249216287571e-05, + "clip_ratio/low_mean": 3.570647322703735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.846572301175911e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 6924.84375, + "completions/mean_terminated_length": 6697.82421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.7969356626272202, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006054217461496592, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 257578501.0, + "reward": 0.5078125, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 0.007889713160693645, + "sampling/sampling_logp_difference/max": 4.842195510864258, + "sampling/sampling_logp_difference/mean": 0.019306108355522156, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0211543894911301e-05, + "clip_ratio/high_mean": 2.5528859737278253e-06, + 
"clip_ratio/low_mean": 5.2388056587915344e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4940942732173426e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14439.0, + "completions/mean_length": 6203.03125, + "completions/mean_terminated_length": 5958.6884765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.8734413683414459, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004903806839138269, + "learning_rate": 1e-05, + "loss": 0.0689, + "num_tokens": 258392625.0, + "reward": 0.4453125, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 0.00020370795391499996, + "sampling/sampling_logp_difference/max": 8.498823165893555, + "sampling/sampling_logp_difference/mean": 0.01909301057457924, + "step": 298 + }, + { + "clip_ratio/high_max": 1.5135058674786706e-05, + "clip_ratio/high_mean": 4.64845766146027e-06, + "clip_ratio/low_mean": 4.373456977191381e-05, + "clip_ratio/low_min": 3.670856358439778e-06, + "clip_ratio/region_mean": 4.8383026296505705e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 7982.5390625, + "completions/mean_terminated_length": 7641.01611328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0091779381036758, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033637424930930138, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 259435270.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999765753746033, + "sampling/importance_sampling_ratio/min": 0.0016514655435457826, + "sampling/sampling_logp_difference/max": 6.406092166900635, + "sampling/sampling_logp_difference/mean": 0.02182736061513424, + "step": 299 + }, + { + "clip_ratio/high_max": 2.3964702677403693e-05, + "clip_ratio/high_mean": 5.991175669350923e-06, + "clip_ratio/low_mean": 5.2442986770984135e-05, + "clip_ratio/low_min": 8.75736759553547e-06, + "clip_ratio/region_mean": 5.843416238349164e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16265.0, + "completions/mean_length": 6915.3125, + "completions/mean_terminated_length": 6688.064453125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.7964543774724007, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052203768864274025, + "learning_rate": 1e-05, + "loss": 0.144, + "num_tokens": 260337614.0, + "reward": 0.46875, + "reward_std": 0.37928223609924316, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 7.032832218101248e-05, + "sampling/sampling_logp_difference/max": 9.562335968017578, + "sampling/sampling_logp_difference/mean": 0.017896221950650215, + "step": 300 + }, + { + "clip_ratio/high_max": 4.458271632756805e-05, + "clip_ratio/high_mean": 1.1145679081892013e-05, + "clip_ratio/low_mean": 6.243192206056847e-05, + "clip_ratio/low_min": 1.2397775662975619e-05, + "clip_ratio/region_mean": 7.357759886872373e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + 
"completions/mean_length": 7029.4375, + "completions/mean_terminated_length": 6880.95263671875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.8605096861720085, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005570738110691309, + "learning_rate": 1e-05, + "loss": 0.0984, + "num_tokens": 261254070.0, + "reward": 0.4765625, + "reward_std": 0.3327290117740631, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999494552612305, + "sampling/importance_sampling_ratio/min": 0.0009070249507203698, + "sampling/sampling_logp_difference/max": 7.005340576171875, + "sampling/sampling_logp_difference/mean": 0.01905740052461624, + "step": 301 + }, + { + "clip_ratio/high_max": 3.390461233720998e-05, + "clip_ratio/high_mean": 1.1191766247975465e-05, + "clip_ratio/low_mean": 7.46641262594494e-05, + "clip_ratio/low_min": 5.041745680500753e-06, + "clip_ratio/region_mean": 8.585589102949598e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5858.84375, + "completions/mean_terminated_length": 5606.240234375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.8430554121732712, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004496110137552023, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 262024906.0, + "reward": 0.4453125, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999294877052307, + "sampling/importance_sampling_ratio/min": 0.00040469475788995624, + 
"sampling/sampling_logp_difference/max": 7.812377452850342, + "sampling/sampling_logp_difference/mean": 0.019225869327783585, + "step": 302 + }, + { + "clip_ratio/high_max": 3.2563955301156966e-06, + "clip_ratio/high_mean": 8.140988825289242e-07, + "clip_ratio/low_mean": 3.7080020149460324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.789411886145899e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15976.0, + "completions/mean_length": 8337.328125, + "completions/mean_terminated_length": 7728.7568359375, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.901745393872261, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00348713924176991, + "learning_rate": 1e-05, + "loss": -0.0002, + "num_tokens": 263110844.0, + "reward": 0.296875, + "reward_std": 0.20805485546588898, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.0022652465850114822, + "sampling/sampling_logp_difference/max": 6.090071678161621, + "sampling/sampling_logp_difference/mean": 0.02157524600625038, + "step": 303 + }, + { + "clip_ratio/high_max": 2.3739744847262045e-05, + "clip_ratio/high_mean": 5.934936211815511e-06, + "clip_ratio/low_mean": 2.823553325015382e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.417046866616147e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 7084.7265625, + "completions/mean_terminated_length": 6381.42041015625, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8265534415841103, + "epoch": 0.2796688132474701, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003980033565312624, + "learning_rate": 1e-05, + "loss": 0.0551, + "num_tokens": 264036169.0, + "reward": 0.3984375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673366546631, + "sampling/importance_sampling_ratio/min": 0.00012345099821686745, + "sampling/sampling_logp_difference/max": 8.999666213989258, + "sampling/sampling_logp_difference/mean": 0.018782664090394974, + "step": 304 + }, + { + "clip_ratio/high_max": 1.1745505617000163e-05, + "clip_ratio/high_mean": 3.771558226617344e-06, + "clip_ratio/low_mean": 6.913120819262986e-05, + "clip_ratio/low_min": 2.494283216947224e-05, + "clip_ratio/region_mean": 7.290276607818669e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6543.796875, + "completions/mean_terminated_length": 6543.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8899869695305824, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.006467343773692846, + "learning_rate": 1e-05, + "loss": 0.1139, + "num_tokens": 264892767.0, + "reward": 0.484375, + "reward_std": 0.3934885561466217, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000489950180054, + "sampling/importance_sampling_ratio/min": 9.891482477542013e-05, + "sampling/sampling_logp_difference/max": 9.221251487731934, + "sampling/sampling_logp_difference/mean": 0.02032080665230751, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.395576979732141e-05, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 4.395576979732141e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16307.0, + "completions/mean_length": 8483.390625, + "completions/mean_terminated_length": 7813.84765625, + "completions/min_length": 1342.0, + "completions/min_terminated_length": 1342.0, + "entropy": 0.9621479511260986, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003174177836626768, + "learning_rate": 1e-05, + "loss": 0.0948, + "num_tokens": 265995697.0, + "reward": 0.3359375, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.0005628522485494614, + "sampling/sampling_logp_difference/max": 7.4824934005737305, + "sampling/sampling_logp_difference/mean": 0.02145479805767536, + "step": 306 + }, + { + "clip_ratio/high_max": 1.2596524811669951e-05, + "clip_ratio/high_mean": 3.149131202917488e-06, + "clip_ratio/low_mean": 3.7911659774181317e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.106079018129094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14985.0, + "completions/mean_length": 7184.578125, + "completions/mean_terminated_length": 6963.79248046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.9993807673454285, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003356153378263116, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 266937707.0, + "reward": 0.3828125, + "reward_std": 0.25566399097442627, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.0017036627978086472, + "sampling/sampling_logp_difference/max": 6.374974727630615, + "sampling/sampling_logp_difference/mean": 0.02204768732190132, + "step": 307 + }, + { + "clip_ratio/high_max": 1.9245163684900035e-05, + "clip_ratio/high_mean": 4.811290921225009e-06, + "clip_ratio/low_mean": 4.8845648166206956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.365693925796222e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16216.0, + "completions/mean_length": 7029.2265625, + "completions/mean_terminated_length": 6727.45947265625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.9139953926205635, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006375293247401714, + "learning_rate": 1e-05, + "loss": 0.0519, + "num_tokens": 267853880.0, + "reward": 0.4765625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.010649868287146091, + "sampling/sampling_logp_difference/max": 4.542207717895508, + "sampling/sampling_logp_difference/mean": 0.020365029573440552, + "step": 308 + }, + { + "clip_ratio/high_max": 4.812504812434781e-06, + "clip_ratio/high_mean": 1.2031262031086953e-06, + "clip_ratio/low_mean": 2.5999243803198624e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.720237000630732e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 6188.0078125, + "completions/mean_terminated_length": 
5943.30419921875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.7640773430466652, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003697809297591448, + "learning_rate": 1e-05, + "loss": 0.0733, + "num_tokens": 268665721.0, + "reward": 0.5078125, + "reward_std": 0.20699402689933777, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372363090515, + "sampling/importance_sampling_ratio/min": 0.02927250787615776, + "sampling/sampling_logp_difference/max": 3.531106472015381, + "sampling/sampling_logp_difference/mean": 0.016581017524003983, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1358927824621787e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1358927824621787e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 8128.21875, + "completions/mean_terminated_length": 7861.90283203125, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.8218234181404114, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002286596456542611, + "learning_rate": 1e-05, + "loss": 0.0763, + "num_tokens": 269726181.0, + "reward": 0.375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798536300659, + "sampling/importance_sampling_ratio/min": 2.4969556307041785e-06, + "sampling/sampling_logp_difference/max": 12.90043830871582, + "sampling/sampling_logp_difference/mean": 0.019403984770178795, + "step": 310 + }, + { + 
"clip_ratio/high_max": 1.4808477317274082e-05, + "clip_ratio/high_mean": 3.7021193293185206e-06, + "clip_ratio/low_mean": 3.0363167581981543e-05, + "clip_ratio/low_min": 6.364238288369961e-06, + "clip_ratio/region_mean": 3.4065286854456645e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16256.0, + "completions/mean_length": 5673.3359375, + "completions/mean_terminated_length": 5503.32568359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.9275510385632515, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00485506234690547, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 270470616.0, + "reward": 0.4921875, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.0009123464697040617, + "sampling/sampling_logp_difference/max": 6.999490737915039, + "sampling/sampling_logp_difference/mean": 0.01881871558725834, + "step": 311 + }, + { + "clip_ratio/high_max": 1.1274602456978755e-05, + "clip_ratio/high_mean": 3.6739949109687586e-06, + "clip_ratio/low_mean": 3.968570712231667e-05, + "clip_ratio/low_min": 3.4213767321489286e-06, + "clip_ratio/region_mean": 4.335970191959859e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 6944.8984375, + "completions/mean_terminated_length": 6795.07177734375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9335741624236107, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005874342750757933, + "learning_rate": 1e-05, + "loss": 0.032, + "num_tokens": 
271377723.0, + "reward": 0.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000594854354858, + "sampling/importance_sampling_ratio/min": 4.3191710574319586e-05, + "sampling/sampling_logp_difference/max": 10.049861907958984, + "sampling/sampling_logp_difference/mean": 0.020590776577591896, + "step": 312 + }, + { + "clip_ratio/high_max": 1.264126694877632e-05, + "clip_ratio/high_mean": 3.16031673719408e-06, + "clip_ratio/low_mean": 3.206376845810155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.522408474054828e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15806.0, + "completions/mean_length": 7705.625, + "completions/mean_terminated_length": 7278.8193359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.8491624072194099, + "epoch": 0.28794848206071755, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001684082904830575, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 272384891.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 6.605865200981498e-05, + "sampling/sampling_logp_difference/max": 9.624967575073242, + "sampling/sampling_logp_difference/mean": 0.020136822015047073, + "step": 313 + }, + { + "clip_ratio/high_max": 9.772357770998497e-06, + "clip_ratio/high_mean": 2.443089442749624e-06, + "clip_ratio/low_mean": 3.8573590472879005e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101667946088128e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6611.1484375, + "completions/mean_terminated_length": 6534.19677734375, + "completions/min_length": 1116.0, + "completions/min_terminated_length": 1116.0, + "entropy": 0.8867302760481834, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003692191792652011, + "learning_rate": 1e-05, + "loss": 0.1233, + "num_tokens": 273251630.0, + "reward": 0.3984375, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.0031062732450664043, + "sampling/sampling_logp_difference/max": 5.774331569671631, + "sampling/sampling_logp_difference/mean": 0.019237037748098373, + "step": 314 + }, + { + "clip_ratio/high_max": 3.0103737344688852e-05, + "clip_ratio/high_mean": 9.664363972206047e-06, + "clip_ratio/low_mean": 1.7575501146893657e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.723986426644842e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15786.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 6770.46875, + "completions/mean_terminated_length": 6770.46875, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.8252957463264465, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004167635925114155, + "learning_rate": 1e-05, + "loss": -0.0072, + "num_tokens": 274146482.0, + "reward": 0.5703125, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + 
"sampling/importance_sampling_ratio/min": 0.00010247006866848096, + "sampling/sampling_logp_difference/max": 9.18593978881836, + "sampling/sampling_logp_difference/mean": 0.019684650003910065, + "step": 315 + }, + { + "clip_ratio/high_max": 6.529460733872838e-06, + "clip_ratio/high_mean": 1.6323651834682096e-06, + "clip_ratio/low_mean": 3.877351048231503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.040587566578324e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15827.0, + "completions/mean_length": 8210.859375, + "completions/mean_terminated_length": 7365.36181640625, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.8118235394358635, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030363225378096104, + "learning_rate": 1e-05, + "loss": 0.0531, + "num_tokens": 275214040.0, + "reward": 0.3515625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998943209648132, + "sampling/importance_sampling_ratio/min": 0.002854935359209776, + "sampling/sampling_logp_difference/max": 5.858705997467041, + "sampling/sampling_logp_difference/mean": 0.019275270402431488, + "step": 316 + }, + { + "clip_ratio/high_max": 7.0800629146106075e-06, + "clip_ratio/high_mean": 1.7700157286526519e-06, + "clip_ratio/low_mean": 2.3981688286767167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5751703674359305e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14900.0, + "completions/mean_length": 7072.8828125, + "completions/mean_terminated_length": 6849.41650390625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 
0.8018335327506065, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004777858033776283, + "learning_rate": 1e-05, + "loss": 0.0404, + "num_tokens": 276138049.0, + "reward": 0.453125, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368190765381, + "sampling/importance_sampling_ratio/min": 0.0028502768836915493, + "sampling/sampling_logp_difference/max": 5.860339164733887, + "sampling/sampling_logp_difference/mean": 0.01849908009171486, + "step": 317 + }, + { + "clip_ratio/high_max": 2.259368602608447e-05, + "clip_ratio/high_mean": 5.648421506521117e-06, + "clip_ratio/low_mean": 4.28424866640853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.849090737479855e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14447.0, + "completions/mean_length": 5889.8359375, + "completions/mean_terminated_length": 5723.26220703125, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.7976400703191757, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030593445990234613, + "learning_rate": 1e-05, + "loss": 0.1331, + "num_tokens": 276910124.0, + "reward": 0.5859375, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.000139843366923742, + "sampling/sampling_logp_difference/max": 8.874987602233887, + "sampling/sampling_logp_difference/mean": 0.01834402233362198, + "step": 318 + }, + { + "clip_ratio/high_max": 1.4654247024736833e-05, + "clip_ratio/high_mean": 
3.663561756184208e-06, + "clip_ratio/low_mean": 2.377464920755301e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7438210736363544e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 7144.265625, + "completions/mean_terminated_length": 6689.85205078125, + "completions/min_length": 1200.0, + "completions/min_terminated_length": 1200.0, + "entropy": 0.8309404999017715, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004245694726705551, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 277843542.0, + "reward": 0.4453125, + "reward_std": 0.24541422724723816, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998534321784973, + "sampling/importance_sampling_ratio/min": 1.0131127055501565e-05, + "sampling/sampling_logp_difference/max": 11.499897956848145, + "sampling/sampling_logp_difference/mean": 0.01875344291329384, + "step": 319 + }, + { + "clip_ratio/high_max": 6.252500952541595e-06, + "clip_ratio/high_mean": 2.241558604509919e-06, + "clip_ratio/low_mean": 4.735765514851664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9599213525652885e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15722.0, + "completions/mean_length": 6779.5234375, + "completions/mean_terminated_length": 6703.8974609375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.9584890529513359, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035574575886130333, + "learning_rate": 1e-05, + "loss": 0.0723, + "num_tokens": 278730129.0, + "reward": 0.3984375, + "reward_std": 0.32825323939323425, + "rewards/accuracy_reward/mean": 
0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.005792221520096064, + "sampling/sampling_logp_difference/max": 5.151239395141602, + "sampling/sampling_logp_difference/mean": 0.02137477695941925, + "step": 320 + }, + { + "clip_ratio/high_max": 3.2948471016425174e-05, + "clip_ratio/high_mean": 9.518853403278627e-06, + "clip_ratio/low_mean": 2.195712454522436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.14759782895635e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15892.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 5582.9765625, + "completions/mean_terminated_length": 5582.9765625, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.8629376217722893, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037982752546668053, + "learning_rate": 1e-05, + "loss": 0.0331, + "num_tokens": 279462542.0, + "reward": 0.5546875, + "reward_std": 0.3164186477661133, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999780058860779, + "sampling/importance_sampling_ratio/min": 0.0021874974481761456, + "sampling/sampling_logp_difference/max": 6.124997138977051, + "sampling/sampling_logp_difference/mean": 0.01906203106045723, + "step": 321 + }, + { + "clip_ratio/high_max": 1.1029473625967512e-05, + "clip_ratio/high_mean": 2.757368406491878e-06, + "clip_ratio/low_mean": 5.367386921761863e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6431237737797346e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 
6942.2578125, + "completions/mean_terminated_length": 6477.90966796875, + "completions/min_length": 1156.0, + "completions/min_terminated_length": 1156.0, + "entropy": 0.8147861957550049, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027678858023136854, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 280370207.0, + "reward": 0.4375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998471736907959, + "sampling/importance_sampling_ratio/min": 0.00023058800434228033, + "sampling/sampling_logp_difference/max": 8.3748779296875, + "sampling/sampling_logp_difference/mean": 0.01940828748047352, + "step": 322 + }, + { + "clip_ratio/high_max": 2.6367894406575942e-05, + "clip_ratio/high_mean": 8.765707434577052e-06, + "clip_ratio/low_mean": 3.232976985145797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.109547796815605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6242.53125, + "completions/mean_terminated_length": 5915.38671875, + "completions/min_length": 1220.0, + "completions/min_terminated_length": 1220.0, + "entropy": 0.878915011882782, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00577945914119482, + "learning_rate": 1e-05, + "loss": 0.0839, + "num_tokens": 281189491.0, + "reward": 0.515625, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 9.611724817659706e-05, + "sampling/sampling_logp_difference/max": 9.2499418258667, + 
"sampling/sampling_logp_difference/mean": 0.01948760263621807, + "step": 323 + }, + { + "clip_ratio/high_max": 3.50839609382092e-05, + "clip_ratio/high_mean": 1.1664920634757436e-05, + "clip_ratio/low_mean": 1.833109013205103e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9996010880495305e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 7004.015625, + "completions/mean_terminated_length": 6622.71533203125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.7964659407734871, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014128695474937558, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 282103997.0, + "reward": 0.4140625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.0024504722096025944, + "sampling/sampling_logp_difference/max": 6.011474609375, + "sampling/sampling_logp_difference/mean": 0.019019678235054016, + "step": 324 + }, + { + "clip_ratio/high_max": 1.832260545597819e-05, + "clip_ratio/high_mean": 4.580651363994548e-06, + "clip_ratio/low_mean": 5.309064226821647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.767129368905444e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7822.6953125, + "completions/mean_terminated_length": 7546.52392578125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.8571138679981232, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002476039342582226, + "learning_rate": 
1e-05, + "loss": 0.0515, + "num_tokens": 283122382.0, + "reward": 0.4609375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.0009774373611435294, + "sampling/sampling_logp_difference/max": 6.930576324462891, + "sampling/sampling_logp_difference/mean": 0.020557202398777008, + "step": 325 + }, + { + "clip_ratio/high_max": 5.738419986300869e-06, + "clip_ratio/high_mean": 1.4346049965752172e-06, + "clip_ratio/low_mean": 4.19679121819172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3402517292179255e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7738.8984375, + "completions/mean_terminated_length": 6844.57763671875, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "entropy": 0.7839021533727646, + "epoch": 0.2999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005309853237122297, + "learning_rate": 1e-05, + "loss": 0.043, + "num_tokens": 284130081.0, + "reward": 0.5234375, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998971223831177, + "sampling/importance_sampling_ratio/min": 0.0001319014554610476, + "sampling/sampling_logp_difference/max": 8.933455467224121, + "sampling/sampling_logp_difference/mean": 0.01873316988348961, + "step": 326 + }, + { + "clip_ratio/high_max": 1.007085802484653e-05, + "clip_ratio/high_mean": 2.5177145062116324e-06, + "clip_ratio/low_mean": 4.043528815600439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.295300277590286e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15952.0, + "completions/mean_length": 7102.2421875, + "completions/mean_terminated_length": 6954.9130859375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.8530801385641098, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228116944432259, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 285058720.0, + "reward": 0.5078125, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00012956927821505815, + "sampling/sampling_logp_difference/max": 8.951294898986816, + "sampling/sampling_logp_difference/mean": 0.019325006753206253, + "step": 327 + }, + { + "clip_ratio/high_max": 4.06874551117653e-06, + "clip_ratio/high_mean": 1.0171863777941326e-06, + "clip_ratio/low_mean": 3.661125703047219e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.762844340826632e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15594.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 6583.4765625, + "completions/mean_terminated_length": 6583.4765625, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 1.021921381354332, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004967439454048872, + "learning_rate": 1e-05, + "loss": 0.0374, + "num_tokens": 285919765.0, + "reward": 0.328125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, 
+ "sampling/importance_sampling_ratio/min": 0.016675354912877083, + "sampling/sampling_logp_difference/max": 4.093823432922363, + "sampling/sampling_logp_difference/mean": 0.021393200382590294, + "step": 328 + }, + { + "clip_ratio/high_max": 1.2215251445013564e-05, + "clip_ratio/high_mean": 3.053812861253391e-06, + "clip_ratio/low_mean": 4.05305947879242e-05, + "clip_ratio/low_min": 4.215567059873138e-06, + "clip_ratio/region_mean": 4.358440742180392e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16299.0, + "completions/mean_length": 7770.5859375, + "completions/mean_terminated_length": 7346.97509765625, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "entropy": 1.0466903448104858, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004189736675471067, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 286935512.0, + "reward": 0.3828125, + "reward_std": 0.2369818240404129, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.011683559976518154, + "sampling/sampling_logp_difference/max": 4.449572563171387, + "sampling/sampling_logp_difference/mean": 0.021805983036756516, + "step": 329 + }, + { + "clip_ratio/high_max": 2.0567378214764176e-05, + "clip_ratio/high_mean": 5.141844553691044e-06, + "clip_ratio/low_mean": 1.8177100628236076e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3318944840866607e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15758.0, + "completions/mean_length": 5689.2421875, + "completions/mean_terminated_length": 5432.568359375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 
1194.0, + "entropy": 0.7778806164860725, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0032866497058421373, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 287681943.0, + "reward": 0.640625, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940812587738, + "sampling/importance_sampling_ratio/min": 0.00038077132194302976, + "sampling/sampling_logp_difference/max": 7.873311519622803, + "sampling/sampling_logp_difference/mean": 0.01789461076259613, + "step": 330 + }, + { + "clip_ratio/high_max": 3.109086901531555e-05, + "clip_ratio/high_mean": 7.772717253828887e-06, + "clip_ratio/low_mean": 3.1423560130861006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919627738468989e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13820.0, + "completions/mean_length": 6288.1875, + "completions/mean_terminated_length": 6127.93701171875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.7709921672940254, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023572889622300863, + "learning_rate": 1e-05, + "loss": 0.0746, + "num_tokens": 288506735.0, + "reward": 0.484375, + "reward_std": 0.3066929280757904, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 0.000430915504693985, + "sampling/sampling_logp_difference/max": 7.749598503112793, + "sampling/sampling_logp_difference/mean": 0.017407266423106194, + "step": 331 + }, + { + "clip_ratio/high_max": 3.4638953366084024e-05, + 
"clip_ratio/high_mean": 9.51674803673086e-06, + "clip_ratio/low_mean": 6.26047980176736e-05, + "clip_ratio/low_min": 5.51267930859467e-06, + "clip_ratio/region_mean": 7.212154741864651e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16318.0, + "completions/mean_length": 6775.0234375, + "completions/mean_terminated_length": 6465.05615234375, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.9338318258523941, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034220058005303144, + "learning_rate": 1e-05, + "loss": 0.0986, + "num_tokens": 289395498.0, + "reward": 0.390625, + "reward_std": 0.34533774852752686, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.0317598432302475, + "sampling/sampling_logp_difference/max": 3.449552536010742, + "sampling/sampling_logp_difference/mean": 0.019930530339479446, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 7.159989991123439e-05, + "clip_ratio/low_min": 1.5592839645250933e-05, + "clip_ratio/region_mean": 7.159989991123439e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 7142.9375, + "completions/mean_terminated_length": 6844.83837890625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.971405878663063, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002513247774913907, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 290329082.0, + "reward": 0.328125, + "reward_std": 0.28930896520614624, + 
"rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999737739562988, + "sampling/importance_sampling_ratio/min": 3.152207455059397e-07, + "sampling/sampling_logp_difference/max": 14.969992637634277, + "sampling/sampling_logp_difference/mean": 0.022366533055901527, + "step": 333 + }, + { + "clip_ratio/high_max": 1.6507752206962323e-05, + "clip_ratio/high_mean": 4.126938051740581e-06, + "clip_ratio/low_mean": 1.7493430505055585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1620368215735652e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15581.0, + "completions/mean_length": 6412.2109375, + "completions/mean_terminated_length": 6333.69287109375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "entropy": 0.9136044681072235, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0056767817586660385, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 291170133.0, + "reward": 0.421875, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999720454216003, + "sampling/importance_sampling_ratio/min": 0.000458698661532253, + "sampling/sampling_logp_difference/max": 7.687117099761963, + "sampling/sampling_logp_difference/mean": 0.020012658089399338, + "step": 334 + }, + { + "clip_ratio/high_max": 8.26085442895419e-06, + "clip_ratio/high_mean": 2.0652136072385474e-06, + "clip_ratio/low_mean": 3.6938338666914206e-05, + "clip_ratio/low_min": 5.699044777429663e-06, + "clip_ratio/region_mean": 3.900355193309224e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16111.0, + "completions/mean_length": 8066.1015625, + "completions/mean_terminated_length": 7797.7822265625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 1.0789504647254944, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00243841833434999, + "learning_rate": 1e-05, + "loss": 0.0432, + "num_tokens": 292222082.0, + "reward": 0.3046875, + "reward_std": 0.2688046097755432, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999664425849915, + "sampling/importance_sampling_ratio/min": 8.481895929435268e-05, + "sampling/sampling_logp_difference/max": 9.374991416931152, + "sampling/sampling_logp_difference/mean": 0.023650091141462326, + "step": 335 + }, + { + "clip_ratio/high_max": 5.320054697222076e-06, + "clip_ratio/high_mean": 1.330013674305519e-06, + "clip_ratio/low_mean": 1.9117383317279746e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0447396991585265e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15176.0, + "completions/mean_length": 6836.046875, + "completions/mean_terminated_length": 6606.896484375, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 1.218759760260582, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0020856577903032303, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 293115984.0, + "reward": 0.21875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 
2.784526441246271e-05, + "sampling/sampling_logp_difference/max": 10.488847732543945, + "sampling/sampling_logp_difference/mean": 0.022012067958712578, + "step": 336 + }, + { + "clip_ratio/high_max": 2.5695502699818462e-05, + "clip_ratio/high_mean": 7.549717793153832e-06, + "clip_ratio/low_mean": 4.6741323160404136e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.429104089671455e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15796.0, + "completions/mean_length": 7501.9921875, + "completions/mean_terminated_length": 7140.9345703125, + "completions/min_length": 1237.0, + "completions/min_terminated_length": 1237.0, + "entropy": 0.8940394818782806, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005163854919373989, + "learning_rate": 1e-05, + "loss": 0.0354, + "num_tokens": 294099503.0, + "reward": 0.328125, + "reward_std": 0.30904707312583923, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999276399612427, + "sampling/importance_sampling_ratio/min": 0.0006545600481331348, + "sampling/sampling_logp_difference/max": 7.331547260284424, + "sampling/sampling_logp_difference/mean": 0.020813245326280594, + "step": 337 + }, + { + "clip_ratio/high_max": 3.1606674838258186e-05, + "clip_ratio/high_mean": 9.45794374729303e-06, + "clip_ratio/low_mean": 4.5567895540443715e-05, + "clip_ratio/low_min": 4.458871444512624e-06, + "clip_ratio/region_mean": 5.502583962879726e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16034.0, + "completions/mean_length": 7204.828125, + "completions/mean_terminated_length": 6908.7255859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.9961872175335884, + 
"epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029277894645929337, + "learning_rate": 1e-05, + "loss": 0.0963, + "num_tokens": 295042105.0, + "reward": 0.390625, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000677108764648, + "sampling/importance_sampling_ratio/min": 1.8970265955431387e-05, + "sampling/sampling_logp_difference/max": 10.872637748718262, + "sampling/sampling_logp_difference/mean": 0.020187582820653915, + "step": 338 + }, + { + "clip_ratio/high_max": 1.7963964182854397e-05, + "clip_ratio/high_mean": 5.194059781388205e-06, + "clip_ratio/low_mean": 1.8380221035840805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.357428081722901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15856.0, + "completions/mean_length": 6256.859375, + "completions/mean_terminated_length": 6013.80810546875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "entropy": 0.9293600022792816, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032952844630926847, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 295867039.0, + "reward": 0.46875, + "reward_std": 0.24670752882957458, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999649524688721, + "sampling/importance_sampling_ratio/min": 7.995560008566827e-05, + "sampling/sampling_logp_difference/max": 9.434039115905762, + "sampling/sampling_logp_difference/mean": 0.019491540268063545, + "step": 339 + }, + { + "clip_ratio/high_max": 7.577551059512189e-06, + "clip_ratio/high_mean": 1.8943877648780472e-06, + 
"clip_ratio/low_mean": 2.7479814093567256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9374201631071628e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15412.0, + "completions/mean_length": 7397.84375, + "completions/mean_terminated_length": 7032.552734375, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.8508890569210052, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029417150653898716, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 296832843.0, + "reward": 0.375, + "reward_std": 0.2867125868797302, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000183582305908, + "sampling/importance_sampling_ratio/min": 1.7783446310204454e-05, + "sampling/sampling_logp_difference/max": 10.93724250793457, + "sampling/sampling_logp_difference/mean": 0.01975393109023571, + "step": 340 + }, + { + "clip_ratio/high_max": 3.281225508544594e-05, + "clip_ratio/high_mean": 1.3302957199812226e-05, + "clip_ratio/low_mean": 5.109179869577929e-05, + "clip_ratio/low_min": 6.657612175331451e-06, + "clip_ratio/region_mean": 6.439475532715733e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14983.0, + "completions/mean_length": 6897.765625, + "completions/mean_terminated_length": 6823.07080078125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9046694040298462, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026788609102368355, + "learning_rate": 1e-05, + "loss": 0.0664, + "num_tokens": 297735285.0, + "reward": 0.421875, + "reward_std": 0.3266732692718506, + "rewards/accuracy_reward/mean": 0.421875, + 
"rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.001710799871943891, + "sampling/sampling_logp_difference/max": 6.370794296264648, + "sampling/sampling_logp_difference/mean": 0.020578179508447647, + "step": 341 + }, + { + "clip_ratio/high_max": 1.7319889593636617e-05, + "clip_ratio/high_mean": 5.168538336874917e-06, + "clip_ratio/low_mean": 7.019768918326008e-05, + "clip_ratio/low_min": 2.541147478041239e-05, + "clip_ratio/region_mean": 7.53662266106403e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15525.0, + "completions/mean_length": 6971.9921875, + "completions/mean_terminated_length": 6509.10595703125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "entropy": 0.8658201694488525, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005915141198784113, + "learning_rate": 1e-05, + "loss": 0.0923, + "num_tokens": 298645124.0, + "reward": 0.3984375, + "reward_std": 0.3742823898792267, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999268651008606, + "sampling/importance_sampling_ratio/min": 0.000970841443631798, + "sampling/sampling_logp_difference/max": 6.937347412109375, + "sampling/sampling_logp_difference/mean": 0.01906151883304119, + "step": 342 + }, + { + "clip_ratio/high_max": 1.8332865238335216e-05, + "clip_ratio/high_mean": 4.583216309583804e-06, + "clip_ratio/low_mean": 6.167940273371642e-05, + "clip_ratio/low_min": 5.969151516183047e-06, + "clip_ratio/region_mean": 6.626261847486603e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15054.0, + 
"completions/mean_length": 6545.6953125, + "completions/mean_terminated_length": 5889.80859375, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.779609851539135, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0032792428974062204, + "learning_rate": 1e-05, + "loss": 0.097, + "num_tokens": 299503781.0, + "reward": 0.609375, + "reward_std": 0.38293448090553284, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999361634254456, + "sampling/importance_sampling_ratio/min": 0.002187495119869709, + "sampling/sampling_logp_difference/max": 6.124998092651367, + "sampling/sampling_logp_difference/mean": 0.017413027584552765, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.46246323235755e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.46246323235755e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15318.0, + "completions/mean_length": 7226.515625, + "completions/mean_terminated_length": 7006.736328125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9573849961161613, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005092279519885778, + "learning_rate": 1e-05, + "loss": 0.1102, + "num_tokens": 300447903.0, + "reward": 0.5390625, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999373555183411, + "sampling/importance_sampling_ratio/min": 0.000627054600045085, + "sampling/sampling_logp_difference/max": 7.374476909637451, + 
"sampling/sampling_logp_difference/mean": 0.021570835262537003, + "step": 344 + }, + { + "clip_ratio/high_max": 5.487269390869187e-06, + "clip_ratio/high_mean": 1.3718173477172968e-06, + "clip_ratio/low_mean": 4.7280102080549113e-05, + "clip_ratio/low_min": 1.0166083029616857e-05, + "clip_ratio/region_mean": 4.865191931457957e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14967.0, + "completions/mean_length": 5755.171875, + "completions/mean_terminated_length": 5323.10546875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8482184633612633, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005033228080719709, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 301206021.0, + "reward": 0.390625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.0014573346124961972, + "sampling/sampling_logp_difference/max": 6.531146049499512, + "sampling/sampling_logp_difference/mean": 0.018870476633310318, + "step": 345 + }, + { + "clip_ratio/high_max": 5.421346941147931e-06, + "clip_ratio/high_mean": 1.3553367352869827e-06, + "clip_ratio/low_mean": 1.6510994441887306e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.786633117717429e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 7098.7265625, + "completions/mean_terminated_length": 6875.88037109375, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "entropy": 0.87320177257061, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.007659573573619127, + 
"learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 302133890.0, + "reward": 0.421875, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.0012466582702472806, + "sampling/sampling_logp_difference/max": 6.687288761138916, + "sampling/sampling_logp_difference/mean": 0.019994346424937248, + "step": 346 + }, + { + "clip_ratio/high_max": 1.1556229310372146e-05, + "clip_ratio/high_mean": 2.8890573275930365e-06, + "clip_ratio/low_mean": 3.8744643916288624e-05, + "clip_ratio/low_min": 6.108287834649673e-06, + "clip_ratio/region_mean": 4.1633702039689524e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16139.0, + "completions/mean_length": 6399.96875, + "completions/mean_terminated_length": 6077.90283203125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9481896534562111, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014135175151750445, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 302972566.0, + "reward": 0.4140625, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0025698256213217974, + "sampling/sampling_logp_difference/max": 5.963917255401611, + "sampling/sampling_logp_difference/mean": 0.02073008380830288, + "step": 347 + }, + { + "clip_ratio/high_max": 6.59491388432798e-06, + "clip_ratio/high_mean": 2.545892130001448e-06, + "clip_ratio/low_mean": 4.620846755187813e-05, + "clip_ratio/low_min": 
6.243132702365983e-06, + "clip_ratio/region_mean": 4.875435956819274e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 7298.078125, + "completions/mean_terminated_length": 7226.53564453125, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "entropy": 0.8719206526875496, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027898226398974657, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 303925976.0, + "reward": 0.484375, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.005236432887613773, + "sampling/sampling_logp_difference/max": 5.252114772796631, + "sampling/sampling_logp_difference/mean": 0.020944103598594666, + "step": 348 + }, + { + "clip_ratio/high_max": 1.052124343914329e-05, + "clip_ratio/high_mean": 2.6303108597858227e-06, + "clip_ratio/low_mean": 2.010384196182713e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.273415248055244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14980.0, + "completions/mean_length": 5667.0390625, + "completions/mean_terminated_length": 5496.9287109375, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "entropy": 0.8791451379656792, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012764945859089494, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 304675157.0, + "reward": 0.390625, + "reward_std": 0.17965976893901825, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000383853912354, + "sampling/importance_sampling_ratio/min": 5.054428584116977e-06, + "sampling/sampling_logp_difference/max": 12.195245742797852, + "sampling/sampling_logp_difference/mean": 0.018928447738289833, + "step": 349 + }, + { + "clip_ratio/high_max": 9.578045592206763e-06, + "clip_ratio/high_mean": 2.3945113980516908e-06, + "clip_ratio/low_mean": 3.1114799753595435e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.350931149270764e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15354.0, + "completions/max_terminated_length": 15354.0, + "completions/mean_length": 5874.4453125, + "completions/mean_terminated_length": 5874.4453125, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9577538818120956, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00509974779561162, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 305447038.0, + "reward": 0.515625, + "reward_std": 0.24777325987815857, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999423027038574, + "sampling/importance_sampling_ratio/min": 0.004791648127138615, + "sampling/sampling_logp_difference/max": 5.340880870819092, + "sampling/sampling_logp_difference/mean": 0.02114470861852169, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0903062275247066e-05, + "clip_ratio/high_mean": 2.7257655688117666e-06, + "clip_ratio/low_mean": 4.784364205079328e-05, + "clip_ratio/low_min": 3.861600362142781e-06, + "clip_ratio/region_mean": 5.056940744907479e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 6197.5703125, + 
"completions/mean_terminated_length": 6035.88134765625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.8665244281291962, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0030849494505673647, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 306258023.0, + "reward": 0.515625, + "reward_std": 0.3748064339160919, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998056888580322, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.021017421036958694, + "step": 351 + }, + { + "clip_ratio/high_max": 1.4299712574938894e-05, + "clip_ratio/high_mean": 4.3520980170796975e-06, + "clip_ratio/low_mean": 6.213493452378316e-05, + "clip_ratio/low_min": 1.0056635801447555e-05, + "clip_ratio/region_mean": 6.648703174505499e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 7522.578125, + "completions/mean_terminated_length": 7381.9208984375, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.8185881152749062, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002946985885500908, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 307240305.0, + "reward": 0.3125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.005127199459820986, + "sampling/sampling_logp_difference/max": 5.273195743560791, + 
"sampling/sampling_logp_difference/mean": 0.01965932548046112, + "step": 352 + }, + { + "clip_ratio/high_max": 1.693051035545068e-05, + "clip_ratio/high_mean": 5.08456730585749e-06, + "clip_ratio/low_mean": 4.2052345861520735e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.713691282631771e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14090.0, + "completions/mean_length": 6403.2265625, + "completions/mean_terminated_length": 6163.6884765625, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "entropy": 0.8359840363264084, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031181599479168653, + "learning_rate": 1e-05, + "loss": 0.072, + "num_tokens": 308079318.0, + "reward": 0.5, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999215602874756, + "sampling/importance_sampling_ratio/min": 6.73715621815063e-05, + "sampling/sampling_logp_difference/max": 9.605287551879883, + "sampling/sampling_logp_difference/mean": 0.01963040418922901, + "step": 353 + }, + { + "clip_ratio/high_max": 1.3988919135954347e-05, + "clip_ratio/high_mean": 3.497229783988587e-06, + "clip_ratio/low_mean": 6.722658486069122e-05, + "clip_ratio/low_min": 1.858519090092159e-05, + "clip_ratio/region_mean": 7.072381458783639e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7954.03125, + "completions/mean_terminated_length": 7751.71240234375, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.905990719795227, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002656223252415657, + 
"learning_rate": 1e-05, + "loss": 0.1022, + "num_tokens": 309117770.0, + "reward": 0.3828125, + "reward_std": 0.321655809879303, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999536275863647, + "sampling/importance_sampling_ratio/min": 0.0003354826185386628, + "sampling/sampling_logp_difference/max": 7.999940395355225, + "sampling/sampling_logp_difference/mean": 0.020741507411003113, + "step": 354 + }, + { + "clip_ratio/high_max": 1.7610595023143105e-05, + "clip_ratio/high_mean": 4.402648755785776e-06, + "clip_ratio/low_mean": 4.337988764291367e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.778253651238629e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6630.09375, + "completions/mean_terminated_length": 6315.45166015625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.870736837387085, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0060529084876179695, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 309988894.0, + "reward": 0.515625, + "reward_std": 0.2790592312812805, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998822212219238, + "sampling/importance_sampling_ratio/min": 2.2177453502081335e-05, + "sampling/sampling_logp_difference/max": 10.716434478759766, + "sampling/sampling_logp_difference/mean": 0.02060208097100258, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0448093235027045e-05, + "clip_ratio/high_mean": 2.6120233087567613e-06, + "clip_ratio/low_mean": 3.1030769946482906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.364279325523967e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15920.0, + "completions/max_terminated_length": 15920.0, + "completions/mean_length": 6679.6171875, + "completions/mean_terminated_length": 6679.6171875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9812518879771233, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00400698184967041, + "learning_rate": 1e-05, + "loss": 0.0605, + "num_tokens": 310864013.0, + "reward": 0.421875, + "reward_std": 0.3295465111732483, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999049305915833, + "sampling/importance_sampling_ratio/min": 0.0020593837834894657, + "sampling/sampling_logp_difference/max": 6.1853485107421875, + "sampling/sampling_logp_difference/mean": 0.02098071575164795, + "step": 356 + }, + { + "clip_ratio/high_max": 2.124982574969181e-05, + "clip_ratio/high_mean": 7.736592579021817e-06, + "clip_ratio/low_mean": 2.900951585615985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.674610888992902e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14541.0, + "completions/mean_length": 5523.796875, + "completions/mean_terminated_length": 5173.4677734375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.9120645374059677, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005929585546255112, + "learning_rate": 1e-05, + "loss": 0.0362, + "num_tokens": 311589987.0, + "reward": 0.4765625, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998446702957153, + "sampling/importance_sampling_ratio/min": 0.0010661041596904397, + "sampling/sampling_logp_difference/max": 6.843744277954102, + "sampling/sampling_logp_difference/mean": 0.019948206841945648, + "step": 357 + }, + { + "clip_ratio/high_max": 2.4486997745043482e-05, + "clip_ratio/high_mean": 8.219769085826556e-06, + "clip_ratio/low_mean": 5.346400575945154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.168377467474784e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15401.0, + "completions/mean_length": 6361.3671875, + "completions/mean_terminated_length": 6282.44873046875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.8044678047299385, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.006622390355914831, + "learning_rate": 1e-05, + "loss": 0.1023, + "num_tokens": 312424034.0, + "reward": 0.5078125, + "reward_std": 0.3724474310874939, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000219345092773, + "sampling/importance_sampling_ratio/min": 0.0003157092141918838, + "sampling/sampling_logp_difference/max": 8.060688972473145, + "sampling/sampling_logp_difference/mean": 0.018907658755779266, + "step": 358 + }, + { + "clip_ratio/high_max": 1.0407376748844399e-05, + "clip_ratio/high_mean": 2.6018441872110998e-06, + "clip_ratio/low_mean": 5.925514369664597e-05, + "clip_ratio/low_min": 1.3324347946763737e-05, + "clip_ratio/region_mean": 6.185698703120579e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15883.0, + "completions/mean_length": 7109.0, + "completions/mean_terminated_length": 7035.96826171875, + 
"completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.9167275875806808, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004639944992959499, + "learning_rate": 1e-05, + "loss": 0.0861, + "num_tokens": 313353346.0, + "reward": 0.4140625, + "reward_std": 0.3826971650123596, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999389052391052, + "sampling/importance_sampling_ratio/min": 0.0019070414127781987, + "sampling/sampling_logp_difference/max": 6.262202262878418, + "sampling/sampling_logp_difference/mean": 0.02155841514468193, + "step": 359 + }, + { + "clip_ratio/high_max": 3.959046694035351e-05, + "clip_ratio/high_mean": 1.0912523691786191e-05, + "clip_ratio/low_mean": 3.3944450819944905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.485697365907981e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6314.2734375, + "completions/mean_terminated_length": 6072.60009765625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.8780038207769394, + "epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007643720600754023, + "learning_rate": 1e-05, + "loss": 0.0873, + "num_tokens": 314180717.0, + "reward": 0.4609375, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999802112579346, + "sampling/importance_sampling_ratio/min": 0.021285315975546837, + "sampling/sampling_logp_difference/max": 3.8497378826141357, + "sampling/sampling_logp_difference/mean": 0.01964358240365982, + "step": 360 
+ }, + { + "clip_ratio/high_max": 3.065382111344661e-05, + "clip_ratio/high_mean": 9.187473835936544e-06, + "clip_ratio/low_mean": 4.137891801292426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.056639065514901e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6718.2265625, + "completions/mean_terminated_length": 6486.24853515625, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.8326799497008324, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0050973957404494286, + "learning_rate": 1e-05, + "loss": 0.0109, + "num_tokens": 315060842.0, + "reward": 0.5078125, + "reward_std": 0.3521803915500641, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.0009130688849836588, + "sampling/sampling_logp_difference/max": 6.998699188232422, + "sampling/sampling_logp_difference/mean": 0.019501537084579468, + "step": 361 + }, + { + "clip_ratio/high_max": 8.624853762739804e-06, + "clip_ratio/high_mean": 2.156213440684951e-06, + "clip_ratio/low_mean": 1.8797969062234188e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0954182048171788e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 8666.8359375, + "completions/mean_terminated_length": 7941.291015625, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.9526705741882324, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019092690199613571, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 316190325.0, + "reward": 
0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999814629554749, + "sampling/importance_sampling_ratio/min": 3.5357668821234256e-05, + "sampling/sampling_logp_difference/max": 10.249995231628418, + "sampling/sampling_logp_difference/mean": 0.02051631174981594, + "step": 362 + }, + { + "clip_ratio/high_max": 2.147400391550036e-05, + "clip_ratio/high_mean": 6.434908300434472e-06, + "clip_ratio/low_mean": 3.521234066283796e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.164724816746457e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15164.0, + "completions/mean_length": 7661.8203125, + "completions/mean_terminated_length": 7002.16015625, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.8322782590985298, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019530428107827902, + "learning_rate": 1e-05, + "loss": 0.0729, + "num_tokens": 317191878.0, + "reward": 0.4609375, + "reward_std": 0.21382391452789307, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 8.546619210392237e-05, + "sampling/sampling_logp_difference/max": 9.367389678955078, + "sampling/sampling_logp_difference/mean": 0.019894573837518692, + "step": 363 + }, + { + "clip_ratio/high_max": 1.9436202364886412e-05, + "clip_ratio/high_mean": 6.089704697842535e-06, + "clip_ratio/low_mean": 4.2698405422925134e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.878810955233348e-05, + "completions/clipped_ratio": 0.0234375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15934.0, + "completions/mean_length": 7024.859375, + "completions/mean_terminated_length": 6800.240234375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.794853538274765, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031784537713974714, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 318109004.0, + "reward": 0.4921875, + "reward_std": 0.31800347566604614, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999352693557739, + "sampling/importance_sampling_ratio/min": 0.0002962362195830792, + "sampling/sampling_logp_difference/max": 8.124353408813477, + "sampling/sampling_logp_difference/mean": 0.018519200384616852, + "step": 364 + }, + { + "clip_ratio/high_max": 4.127455667912727e-06, + "clip_ratio/high_mean": 1.0318639169781818e-06, + "clip_ratio/low_mean": 4.342453667049995e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.445640047379129e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 7282.1796875, + "completions/mean_terminated_length": 6912.1865234375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.904067650437355, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005080109462141991, + "learning_rate": 1e-05, + "loss": 0.041, + "num_tokens": 319059075.0, + "reward": 0.4140625, + "reward_std": 0.26539456844329834, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000062108039856, + 
"sampling/importance_sampling_ratio/min": 0.1194523349404335, + "sampling/sampling_logp_difference/max": 6.136754989624023, + "sampling/sampling_logp_difference/mean": 0.019978653639554977, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.608940076243016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.608940076243016e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15625.0, + "completions/mean_length": 7131.5234375, + "completions/mean_terminated_length": 6596.255859375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.8849587142467499, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022667953744530678, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 319990046.0, + "reward": 0.46875, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0370909757912159, + "sampling/sampling_logp_difference/max": 3.294381618499756, + "sampling/sampling_logp_difference/mean": 0.02037571743130684, + "step": 366 + }, + { + "clip_ratio/high_max": 1.5356635913121863e-05, + "clip_ratio/high_mean": 3.839158978280466e-06, + "clip_ratio/low_mean": 3.4950805911648786e-05, + "clip_ratio/low_min": 4.876336333836662e-06, + "clip_ratio/region_mean": 3.8789965287833184e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16205.0, + "completions/mean_length": 6655.4453125, + "completions/mean_terminated_length": 6578.84228515625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.7417122721672058, + "epoch": 
0.3376264949402024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00216497085057199, + "learning_rate": 1e-05, + "loss": 0.0681, + "num_tokens": 320860135.0, + "reward": 0.5625, + "reward_std": 0.3369230031967163, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0005190494703128934, + "sampling/sampling_logp_difference/max": 7.563511371612549, + "sampling/sampling_logp_difference/mean": 0.01771342009305954, + "step": 367 + }, + { + "clip_ratio/high_max": 1.7605634639039636e-05, + "clip_ratio/high_mean": 5.297029474604642e-06, + "clip_ratio/low_mean": 5.688933060810086e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.218636053745286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15849.0, + "completions/mean_length": 7077.1640625, + "completions/mean_terminated_length": 6619.45068359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.8749325424432755, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0028338562697172165, + "learning_rate": 1e-05, + "loss": 0.0643, + "num_tokens": 321783852.0, + "reward": 0.3828125, + "reward_std": 0.2120065838098526, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998220205307007, + "sampling/importance_sampling_ratio/min": 7.83290306571871e-06, + "sampling/sampling_logp_difference/max": 11.757177352905273, + "sampling/sampling_logp_difference/mean": 0.020299233496189117, + "step": 368 + }, + { + "clip_ratio/high_max": 7.301828190975357e-06, + "clip_ratio/high_mean": 1.8254570477438392e-06, + "clip_ratio/low_mean": 
5.158197632226802e-05, + "clip_ratio/low_min": 3.735804057214409e-06, + "clip_ratio/region_mean": 5.340743223314348e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6034.296875, + "completions/mean_terminated_length": 5525.294921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.80014718323946, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022897711023688316, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 322572882.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999347925186157, + "sampling/importance_sampling_ratio/min": 0.0004105660773348063, + "sampling/sampling_logp_difference/max": 7.7979736328125, + "sampling/sampling_logp_difference/mean": 0.01858348958194256, + "step": 369 + }, + { + "clip_ratio/high_max": 9.364057859784225e-06, + "clip_ratio/high_mean": 3.351393047523743e-06, + "clip_ratio/low_mean": 4.186752630630508e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5218919240141986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15948.0, + "completions/mean_length": 8172.109375, + "completions/mean_terminated_length": 7838.29248046875, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "entropy": 0.8732693120837212, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003263789461925626, + "learning_rate": 1e-05, + "loss": 0.0356, + "num_tokens": 323640904.0, + "reward": 0.2890625, + "reward_std": 0.3237774670124054, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999354481697083, + "sampling/importance_sampling_ratio/min": 9.27252222027164e-06, + "sampling/sampling_logp_difference/max": 11.588455200195312, + "sampling/sampling_logp_difference/mean": 0.0208889190107584, + "step": 370 + }, + { + "clip_ratio/high_max": 2.0998899799451465e-05, + "clip_ratio/high_mean": 6.692962131182867e-06, + "clip_ratio/low_mean": 4.261424010110204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.930720297124935e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16228.0, + "completions/mean_length": 7699.203125, + "completions/mean_terminated_length": 7419.04833984375, + "completions/min_length": 1225.0, + "completions/min_terminated_length": 1225.0, + "entropy": 0.8296505436301231, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0042716520838439465, + "learning_rate": 1e-05, + "loss": 0.0937, + "num_tokens": 324643858.0, + "reward": 0.4921875, + "reward_std": 0.3090519607067108, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999874234199524, + "sampling/importance_sampling_ratio/min": 0.00022192654432728887, + "sampling/sampling_logp_difference/max": 8.413164138793945, + "sampling/sampling_logp_difference/mean": 0.018926654011011124, + "step": 371 + }, + { + "clip_ratio/high_max": 7.061349151626928e-06, + "clip_ratio/high_mean": 1.765337287906732e-06, + "clip_ratio/low_mean": 4.5005243464402156e-05, + "clip_ratio/low_min": 3.861838649754645e-06, + "clip_ratio/region_mean": 4.6770580411248375e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16364.0, + "completions/max_terminated_length": 16364.0, + "completions/mean_length": 7450.1640625, + 
"completions/mean_terminated_length": 7450.1640625, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "entropy": 1.0400195196270943, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0033558050636202097, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 325617687.0, + "reward": 0.2578125, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999459385871887, + "sampling/importance_sampling_ratio/min": 0.039920732378959656, + "sampling/sampling_logp_difference/max": 3.2208595275878906, + "sampling/sampling_logp_difference/mean": 0.02249298244714737, + "step": 372 + }, + { + "clip_ratio/high_max": 1.3147802746971138e-05, + "clip_ratio/high_mean": 3.2869506867427845e-06, + "clip_ratio/low_mean": 2.4451034505545977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7737984851228248e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15342.0, + "completions/mean_length": 6799.0703125, + "completions/mean_terminated_length": 6723.5986328125, + "completions/min_length": 1708.0, + "completions/min_terminated_length": 1708.0, + "entropy": 0.9737623482942581, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005797459278255701, + "learning_rate": 1e-05, + "loss": 0.0476, + "num_tokens": 326508384.0, + "reward": 0.3125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999321699142456, + "sampling/importance_sampling_ratio/min": 7.535634836131067e-07, + "sampling/sampling_logp_difference/max": 14.0984525680542, + 
"sampling/sampling_logp_difference/mean": 0.021543748676776886, + "step": 373 + }, + { + "clip_ratio/high_max": 3.3594023989280686e-06, + "clip_ratio/high_mean": 8.398505997320171e-07, + "clip_ratio/low_mean": 2.3457610382138228e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4297460981870245e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16102.0, + "completions/mean_length": 7034.3671875, + "completions/mean_terminated_length": 6654.30078125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8749603256583214, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002258980879560113, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 327426407.0, + "reward": 0.4609375, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.008719252422451973, + "sampling/sampling_logp_difference/max": 4.742221832275391, + "sampling/sampling_logp_difference/mean": 0.01997346058487892, + "step": 374 + }, + { + "clip_ratio/high_max": 2.823375348270929e-05, + "clip_ratio/high_mean": 7.058438370677322e-06, + "clip_ratio/low_mean": 4.9395109726901865e-05, + "clip_ratio/low_min": 1.636556044104509e-05, + "clip_ratio/region_mean": 5.6453548268109444e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15240.0, + "completions/mean_length": 6623.078125, + "completions/mean_terminated_length": 6388.81640625, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.858784057199955, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.002420129720121622, + "learning_rate": 1e-05, + "loss": 0.076, + "num_tokens": 328292985.0, + "reward": 0.4140625, + "reward_std": 0.3077537417411804, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 0.00014900295354891568, + "sampling/sampling_logp_difference/max": 8.811544418334961, + "sampling/sampling_logp_difference/mean": 0.019645996391773224, + "step": 375 + }, + { + "clip_ratio/high_max": 1.8078507309837732e-05, + "clip_ratio/high_mean": 6.468551191574079e-06, + "clip_ratio/low_mean": 4.051302585139638e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.698157727034413e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15229.0, + "completions/mean_length": 5902.4765625, + "completions/mean_terminated_length": 5564.36279296875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.904740035533905, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004107976797968149, + "learning_rate": 1e-05, + "loss": 0.0824, + "num_tokens": 329067006.0, + "reward": 0.5546875, + "reward_std": 0.3945493996143341, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 1.1485875802463852e-05, + "sampling/sampling_logp_difference/max": 11.37439250946045, + "sampling/sampling_logp_difference/mean": 0.019582755863666534, + "step": 376 + }, + { + "clip_ratio/high_max": 2.553658168835682e-05, + "clip_ratio/high_mean": 7.276365181496658e-06, + "clip_ratio/low_mean": 1.7552573126522475e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.482893796695862e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14514.0, + "completions/mean_length": 6425.6015625, + "completions/mean_terminated_length": 6267.5322265625, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.964553713798523, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003208522219210863, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 329910691.0, + "reward": 0.359375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999419450759888, + "sampling/importance_sampling_ratio/min": 0.00137569778598845, + "sampling/sampling_logp_difference/max": 6.588794231414795, + "sampling/sampling_logp_difference/mean": 0.021154657006263733, + "step": 377 + }, + { + "clip_ratio/high_max": 6.8712420215888415e-06, + "clip_ratio/high_mean": 1.7178105053972104e-06, + "clip_ratio/low_mean": 4.0991827404468495e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2709637853022286e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 8006.4453125, + "completions/mean_terminated_length": 7594.43408203125, + "completions/min_length": 1235.0, + "completions/min_terminated_length": 1235.0, + "entropy": 0.8980336412787437, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002898421371355653, + "learning_rate": 1e-05, + "loss": 0.0815, + "num_tokens": 330956332.0, + "reward": 0.4296875, + "reward_std": 0.20175684988498688, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998818635940552, + "sampling/importance_sampling_ratio/min": 9.378339746035635e-05, + "sampling/sampling_logp_difference/max": 9.27452278137207, + "sampling/sampling_logp_difference/mean": 0.021021340042352676, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2689344689297286e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2689344689297286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15484.0, + "completions/max_terminated_length": 15484.0, + "completions/mean_length": 7068.828125, + "completions/mean_terminated_length": 7068.828125, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.9865007549524307, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0037063576746731997, + "learning_rate": 1e-05, + "loss": 0.0313, + "num_tokens": 331880918.0, + "reward": 0.3203125, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 0.0001819290773710236, + "sampling/sampling_logp_difference/max": 8.611893653869629, + "sampling/sampling_logp_difference/mean": 0.02072504535317421, + "step": 379 + }, + { + "clip_ratio/high_max": 5.845633268108941e-06, + "clip_ratio/high_mean": 1.4614083170272352e-06, + "clip_ratio/low_mean": 3.207486906831036e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.353627721480734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 7379.390625, + "completions/mean_terminated_length": 7236.4609375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + 
"entropy": 0.8977236375212669, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001972826896235347, + "learning_rate": 1e-05, + "loss": 0.0228, + "num_tokens": 332849112.0, + "reward": 0.4140625, + "reward_std": 0.28247418999671936, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 2.820451663865242e-05, + "sampling/sampling_logp_difference/max": 10.476028442382812, + "sampling/sampling_logp_difference/mean": 0.019411223009228706, + "step": 380 + }, + { + "clip_ratio/high_max": 4.875385002378607e-06, + "clip_ratio/high_mean": 1.2188462505946518e-06, + "clip_ratio/low_mean": 2.3530714997832547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.47495612484272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15517.0, + "completions/mean_length": 6867.9609375, + "completions/mean_terminated_length": 6793.03125, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "entropy": 0.9244343340396881, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.006926023401319981, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 333746179.0, + "reward": 0.4140625, + "reward_std": 0.1433562934398651, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999299645423889, + "sampling/importance_sampling_ratio/min": 0.0003875594411510974, + "sampling/sampling_logp_difference/max": 7.8556413650512695, + "sampling/sampling_logp_difference/mean": 0.020311862230300903, + "step": 381 + }, + { + "clip_ratio/high_max": 1.5651628245905158e-05, + 
"clip_ratio/high_mean": 4.836261211949022e-06, + "clip_ratio/low_mean": 5.268017821435933e-05, + "clip_ratio/low_min": 3.950945028918795e-06, + "clip_ratio/region_mean": 5.751643902840442e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 7525.375, + "completions/mean_terminated_length": 6855.3955078125, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9207312315702438, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0047226278111338615, + "learning_rate": 1e-05, + "loss": 0.0808, + "num_tokens": 334731027.0, + "reward": 0.3359375, + "reward_std": 0.3353874683380127, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999615550041199, + "sampling/importance_sampling_ratio/min": 0.00029753465787507594, + "sampling/sampling_logp_difference/max": 8.119979858398438, + "sampling/sampling_logp_difference/mean": 0.021496692672371864, + "step": 382 + }, + { + "clip_ratio/high_max": 3.815379886873416e-05, + "clip_ratio/high_mean": 9.53844971718354e-06, + "clip_ratio/low_mean": 4.519663821156428e-05, + "clip_ratio/low_min": 2.775434040813707e-06, + "clip_ratio/region_mean": 5.473508826980833e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16251.0, + "completions/mean_length": 6841.0625, + "completions/mean_terminated_length": 6453.13818359375, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.8979457840323448, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004971448332071304, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 335631243.0, + "reward": 0.390625, + "reward_std": 
0.2596156895160675, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999934196472168, + "sampling/importance_sampling_ratio/min": 9.655764188210014e-06, + "sampling/sampling_logp_difference/max": 11.547955513000488, + "sampling/sampling_logp_difference/mean": 0.020256079733371735, + "step": 383 + }, + { + "clip_ratio/high_max": 4.162365712545579e-06, + "clip_ratio/high_mean": 1.0405914281363948e-06, + "clip_ratio/low_mean": 3.1563491688757495e-05, + "clip_ratio/low_min": 3.1228139505401487e-06, + "clip_ratio/region_mean": 3.260408311689389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15060.0, + "completions/mean_length": 6919.8046875, + "completions/mean_terminated_length": 6454.35205078125, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.9241961911320686, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038604787550866604, + "learning_rate": 1e-05, + "loss": 0.0262, + "num_tokens": 336537162.0, + "reward": 0.375, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998080730438232, + "sampling/importance_sampling_ratio/min": 0.0009118975722230971, + "sampling/sampling_logp_difference/max": 6.999982833862305, + "sampling/sampling_logp_difference/mean": 0.02030865103006363, + "step": 384 + }, + { + "clip_ratio/high_max": 6.5182248363271356e-06, + "clip_ratio/high_mean": 1.6295562090817839e-06, + "clip_ratio/low_mean": 4.3847362121596234e-05, + "clip_ratio/low_min": 6.294533704931382e-06, + "clip_ratio/region_mean": 4.547691833067802e-05, + "completions/clipped_ratio": 0.0625, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15692.0, + "completions/mean_length": 7679.390625, + "completions/mean_terminated_length": 7099.08349609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 1.0165777206420898, + "epoch": 0.35418583256669733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004624314606189728, + "learning_rate": 1e-05, + "loss": 0.0849, + "num_tokens": 337542492.0, + "reward": 0.3046875, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999251961708069, + "sampling/importance_sampling_ratio/min": 5.83546279813163e-05, + "sampling/sampling_logp_difference/max": 9.748971939086914, + "sampling/sampling_logp_difference/mean": 0.02206476218998432, + "step": 385 + }, + { + "clip_ratio/high_max": 6.00499606662197e-06, + "clip_ratio/high_mean": 1.5012490166554926e-06, + "clip_ratio/low_mean": 3.392923713363416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.543048615028965e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15819.0, + "completions/mean_length": 5957.5859375, + "completions/mean_terminated_length": 5792.08740234375, + "completions/min_length": 1705.0, + "completions/min_terminated_length": 1705.0, + "entropy": 0.7705951780080795, + "epoch": 0.35510579576816925, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021966886706650257, + "learning_rate": 1e-05, + "loss": 0.0789, + "num_tokens": 338324279.0, + "reward": 0.53125, + "reward_std": 0.22962790727615356, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999998927116394, + 
"sampling/importance_sampling_ratio/min": 0.0008041196851991117, + "sampling/sampling_logp_difference/max": 7.125762462615967, + "sampling/sampling_logp_difference/mean": 0.01804077997803688, + "step": 386 + }, + { + "clip_ratio/high_max": 1.5711350215497077e-05, + "clip_ratio/high_mean": 3.927837553874269e-06, + "clip_ratio/low_mean": 5.276240381135722e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.669024130838807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7269.8046875, + "completions/mean_terminated_length": 7198.03955078125, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "entropy": 1.0025205165147781, + "epoch": 0.3560257589696412, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001694107661023736, + "learning_rate": 1e-05, + "loss": 0.134, + "num_tokens": 339274662.0, + "reward": 0.3359375, + "reward_std": 0.30487072467803955, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039769172668, + "sampling/importance_sampling_ratio/min": 0.0015677008777856827, + "sampling/sampling_logp_difference/max": 6.4581451416015625, + "sampling/sampling_logp_difference/mean": 0.021742526441812515, + "step": 387 + }, + { + "clip_ratio/high_max": 7.005848829066963e-06, + "clip_ratio/high_mean": 1.7514622072667407e-06, + "clip_ratio/low_mean": 5.100632029098051e-05, + "clip_ratio/low_min": 8.934973720897688e-06, + "clip_ratio/region_mean": 5.275778244140383e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 7643.8359375, + "completions/mean_terminated_length": 7288.54443359375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 
1061.0, + "entropy": 0.7936615869402885, + "epoch": 0.35694572217111314, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004587972536683083, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 340272689.0, + "reward": 0.5078125, + "reward_std": 0.35324612259864807, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999613761901855, + "sampling/importance_sampling_ratio/min": 0.0007390327518805861, + "sampling/sampling_logp_difference/max": 7.210168361663818, + "sampling/sampling_logp_difference/mean": 0.01862112432718277, + "step": 388 + }, + { + "clip_ratio/high_max": 1.0522736374696251e-05, + "clip_ratio/high_mean": 2.6306840936740628e-06, + "clip_ratio/low_mean": 2.139122614153166e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4021910121518886e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14401.0, + "completions/mean_length": 7068.734375, + "completions/mean_terminated_length": 6610.60595703125, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "entropy": 0.8858344480395317, + "epoch": 0.3578656853725851, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00245783943682909, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 341195599.0, + "reward": 0.4609375, + "reward_std": 0.21594557166099548, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999957263469696, + "sampling/importance_sampling_ratio/min": 1.526316918898374e-05, + "sampling/sampling_logp_difference/max": 11.090067863464355, + "sampling/sampling_logp_difference/mean": 0.019989900290966034, + "step": 389 + }, + { + "clip_ratio/high_max": 5.272259386401856e-06, + 
"clip_ratio/high_mean": 1.318064846600464e-06, + "clip_ratio/low_mean": 2.2939096254503966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4257160987417592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15788.0, + "completions/mean_length": 6093.296875, + "completions/mean_terminated_length": 5929.95263671875, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.9640207663178444, + "epoch": 0.35878564857405704, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0067657483741641045, + "learning_rate": 1e-05, + "loss": 0.0181, + "num_tokens": 341993565.0, + "reward": 0.4453125, + "reward_std": 0.12415502220392227, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998992681503296, + "sampling/importance_sampling_ratio/min": 0.010459281504154205, + "sampling/sampling_logp_difference/max": 4.56026554107666, + "sampling/sampling_logp_difference/mean": 0.02037961222231388, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.566248594528588e-05, + "clip_ratio/low_min": 4.402028480399167e-06, + "clip_ratio/region_mean": 4.566248594528588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16170.0, + "completions/max_terminated_length": 16170.0, + "completions/mean_length": 7620.09375, + "completions/mean_terminated_length": 7620.09375, + "completions/min_length": 1076.0, + "completions/min_terminated_length": 1076.0, + "entropy": 0.9773544892668724, + "epoch": 0.35970561177552896, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018817185191437602, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 342990545.0, + "reward": 0.3046875, + "reward_std": 0.18755048513412476, + "rewards/accuracy_reward/mean": 0.3046875, 
+ "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0006883936002850533, + "sampling/sampling_logp_difference/max": 7.281149864196777, + "sampling/sampling_logp_difference/mean": 0.021528441458940506, + "step": 391 + }, + { + "clip_ratio/high_max": 2.6727505428425502e-05, + "clip_ratio/high_mean": 7.985045499481203e-06, + "clip_ratio/low_mean": 7.762144696243922e-05, + "clip_ratio/low_min": 2.4772080450929934e-05, + "clip_ratio/region_mean": 8.560649303035461e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15053.0, + "completions/mean_length": 6963.984375, + "completions/mean_terminated_length": 6737.904296875, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.9683744385838509, + "epoch": 0.36062557497700093, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0052104732021689415, + "learning_rate": 1e-05, + "loss": 0.087, + "num_tokens": 343898791.0, + "reward": 0.4296875, + "reward_std": 0.3621976971626282, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999324679374695, + "sampling/importance_sampling_ratio/min": 0.010815954767167568, + "sampling/sampling_logp_difference/max": 4.526732921600342, + "sampling/sampling_logp_difference/mean": 0.021434593945741653, + "step": 392 + }, + { + "clip_ratio/high_max": 1.3545108686230378e-05, + "clip_ratio/high_mean": 4.365133804640209e-06, + "clip_ratio/low_mean": 2.5377692509209737e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9742826200163108e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15116.0, + 
"completions/mean_length": 6718.5078125, + "completions/mean_terminated_length": 6642.4013671875, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.9043834507465363, + "epoch": 0.36154553817847285, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005151392426341772, + "learning_rate": 1e-05, + "loss": 0.0085, + "num_tokens": 344779672.0, + "reward": 0.4921875, + "reward_std": 0.251188188791275, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999840497970581, + "sampling/importance_sampling_ratio/min": 0.0024171893019229174, + "sampling/sampling_logp_difference/max": 6.025149822235107, + "sampling/sampling_logp_difference/mean": 0.0201373603194952, + "step": 393 + }, + { + "clip_ratio/high_max": 1.2263486723895767e-05, + "clip_ratio/high_mean": 3.927679188109323e-06, + "clip_ratio/low_mean": 2.739263118201052e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.132031042696326e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16342.0, + "completions/mean_length": 7044.640625, + "completions/mean_terminated_length": 6820.49609375, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "entropy": 0.9017335474491119, + "epoch": 0.3624655013799448, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026606651954352856, + "learning_rate": 1e-05, + "loss": 0.0554, + "num_tokens": 345701722.0, + "reward": 0.3125, + "reward_std": 0.24146249890327454, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 3.8765938370488584e-05, + "sampling/sampling_logp_difference/max": 
10.157968521118164, + "sampling/sampling_logp_difference/mean": 0.01981864869594574, + "step": 394 + }, + { + "clip_ratio/high_max": 1.026556356009678e-05, + "clip_ratio/high_mean": 2.566390890024195e-06, + "clip_ratio/low_mean": 4.819571529424138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0762106297952414e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15476.0, + "completions/mean_length": 6031.875, + "completions/mean_terminated_length": 5950.3623046875, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.8537683561444283, + "epoch": 0.36338546458141674, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003957017324864864, + "learning_rate": 1e-05, + "loss": 0.0947, + "num_tokens": 346492810.0, + "reward": 0.4296875, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999707341194153, + "sampling/importance_sampling_ratio/min": 0.0015133036067709327, + "sampling/sampling_logp_difference/max": 6.493460178375244, + "sampling/sampling_logp_difference/mean": 0.018711457028985023, + "step": 395 + }, + { + "clip_ratio/high_max": 5.870488848813693e-06, + "clip_ratio/high_mean": 1.4676222122034233e-06, + "clip_ratio/low_mean": 3.637038832948747e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.783801014378696e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 7429.3515625, + "completions/mean_terminated_length": 6911.31396484375, + "completions/min_length": 1194.0, + "completions/min_terminated_length": 1194.0, + "entropy": 0.8821266070008278, + "epoch": 0.36430542778288866, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.002122648525983095, + "learning_rate": 1e-05, + "loss": 0.1257, + "num_tokens": 347462871.0, + "reward": 0.453125, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000076293945312, + "sampling/importance_sampling_ratio/min": 0.00014005196862854064, + "sampling/sampling_logp_difference/max": 8.873497009277344, + "sampling/sampling_logp_difference/mean": 0.01998838409781456, + "step": 396 + }, + { + "clip_ratio/high_max": 1.0663932243915042e-05, + "clip_ratio/high_mean": 2.6659830609787605e-06, + "clip_ratio/low_mean": 6.443337406381033e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.709935701110226e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15761.0, + "completions/mean_length": 7131.7109375, + "completions/mean_terminated_length": 6833.25, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "entropy": 0.8575824722647667, + "epoch": 0.36522539098436063, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002546454081311822, + "learning_rate": 1e-05, + "loss": 0.0676, + "num_tokens": 348395842.0, + "reward": 0.4921875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999964714050293, + "sampling/importance_sampling_ratio/min": 0.0002167800412280485, + "sampling/sampling_logp_difference/max": 8.436627388000488, + "sampling/sampling_logp_difference/mean": 0.0193922221660614, + "step": 397 + }, + { + "clip_ratio/high_max": 3.847337666229578e-06, + "clip_ratio/high_mean": 9.618344165573944e-07, + "clip_ratio/low_mean": 3.932982110654848e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
4.029165563679271e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16200.0, + "completions/mean_length": 6858.34375, + "completions/mean_terminated_length": 6707.14306640625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.9539813920855522, + "epoch": 0.36614535418583255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00492837093770504, + "learning_rate": 1e-05, + "loss": 0.0818, + "num_tokens": 349292790.0, + "reward": 0.390625, + "reward_std": 0.1949220597743988, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998850226402283, + "sampling/importance_sampling_ratio/min": 0.0011153683299198747, + "sampling/sampling_logp_difference/max": 6.79857063293457, + "sampling/sampling_logp_difference/mean": 0.020318543538451195, + "step": 398 + }, + { + "clip_ratio/high_max": 1.291372609557584e-05, + "clip_ratio/high_mean": 3.22843152389396e-06, + "clip_ratio/low_mean": 3.8245348378040944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1473780811429606e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15261.0, + "completions/mean_length": 7809.984375, + "completions/mean_terminated_length": 7533.40283203125, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.8353303670883179, + "epoch": 0.3670653173873045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004895905964076519, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 350312556.0, + "reward": 0.3203125, + "reward_std": 0.22567616403102875, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999260306358337, + "sampling/importance_sampling_ratio/min": 0.0008417933131568134, + "sampling/sampling_logp_difference/max": 7.0799760818481445, + "sampling/sampling_logp_difference/mean": 0.018754083663225174, + "step": 399 + }, + { + "clip_ratio/high_max": 1.1250081115576904e-05, + "clip_ratio/high_mean": 3.5690324011738994e-06, + "clip_ratio/low_mean": 3.196108968950284e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.553012152224255e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15057.0, + "completions/mean_length": 7194.9296875, + "completions/mean_terminated_length": 6821.39013671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.9744522422552109, + "epoch": 0.36798528058877644, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0032397822942584753, + "learning_rate": 1e-05, + "loss": 0.0402, + "num_tokens": 351252755.0, + "reward": 0.421875, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998766183853149, + "sampling/importance_sampling_ratio/min": 0.00023159870761446655, + "sampling/sampling_logp_difference/max": 8.370504379272461, + "sampling/sampling_logp_difference/mean": 0.02105094864964485, + "step": 400 + }, + { + "clip_ratio/high_max": 6.980455509619787e-06, + "clip_ratio/high_mean": 1.7451138774049468e-06, + "clip_ratio/low_mean": 2.2670621888210007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.441573599298863e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 6836.234375, + "completions/mean_terminated_length": 6607.08837890625, + "completions/min_length": 
379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.9149863049387932, + "epoch": 0.3689052437902484, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031576494220644236, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 352145873.0, + "reward": 0.3671875, + "reward_std": 0.22225630283355713, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266862869263, + "sampling/importance_sampling_ratio/min": 0.0011975533561781049, + "sampling/sampling_logp_difference/max": 6.727474689483643, + "sampling/sampling_logp_difference/mean": 0.020445333793759346, + "step": 401 + }, + { + "clip_ratio/high_max": 2.3557336589874467e-05, + "clip_ratio/high_mean": 5.889334147468617e-06, + "clip_ratio/low_mean": 5.359988131203863e-05, + "clip_ratio/low_min": 1.3856095392839052e-05, + "clip_ratio/region_mean": 5.9489215118446737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 6942.65625, + "completions/mean_terminated_length": 6638.0966796875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.7541583999991417, + "epoch": 0.36982520699172033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003970830701291561, + "learning_rate": 1e-05, + "loss": 0.051, + "num_tokens": 353056405.0, + "reward": 0.453125, + "reward_std": 0.3282659649848938, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 8.399576472584158e-06, + "sampling/sampling_logp_difference/max": 11.687329292297363, + "sampling/sampling_logp_difference/mean": 0.018101349472999573, + "step": 402 + }, + { + 
"clip_ratio/high_max": 2.6139805413549766e-05, + "clip_ratio/high_mean": 7.517377525800839e-06, + "clip_ratio/low_mean": 1.968103515537223e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7198412681173068e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14786.0, + "completions/max_terminated_length": 14786.0, + "completions/mean_length": 6022.1875, + "completions/mean_terminated_length": 6022.1875, + "completions/min_length": 1285.0, + "completions/min_terminated_length": 1285.0, + "entropy": 0.9535745903849602, + "epoch": 0.37074517019319225, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0043656788766384125, + "learning_rate": 1e-05, + "loss": 0.029, + "num_tokens": 353844661.0, + "reward": 0.4140625, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999772310256958, + "sampling/importance_sampling_ratio/min": 0.04981832951307297, + "sampling/sampling_logp_difference/max": 2.9993722438812256, + "sampling/sampling_logp_difference/mean": 0.020655371248722076, + "step": 403 + }, + { + "clip_ratio/high_max": 9.152076700047473e-06, + "clip_ratio/high_mean": 2.9508817647183605e-06, + "clip_ratio/low_mean": 5.21388310517068e-05, + "clip_ratio/low_min": 2.633131089169183e-06, + "clip_ratio/region_mean": 5.508971298695542e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15906.0, + "completions/mean_length": 8068.96875, + "completions/mean_terminated_length": 7869.408203125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.9473539590835571, + "epoch": 0.3716651333946642, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006543307099491358, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 354894689.0, + "reward": 0.2578125, + 
"reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 6.672408926533535e-05, + "sampling/sampling_logp_difference/max": 9.614944458007812, + "sampling/sampling_logp_difference/mean": 0.021852033212780952, + "step": 404 + }, + { + "clip_ratio/high_max": 2.9619268843816826e-05, + "clip_ratio/high_mean": 7.4048172109542065e-06, + "clip_ratio/low_mean": 5.5152235972855124e-05, + "clip_ratio/low_min": 1.0455875781190116e-05, + "clip_ratio/region_mean": 6.255705375224352e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15748.0, + "completions/mean_length": 5960.1875, + "completions/mean_terminated_length": 5878.1103515625, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 0.9564141109585762, + "epoch": 0.37258509659613614, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003351036459207535, + "learning_rate": 1e-05, + "loss": 0.0293, + "num_tokens": 355677273.0, + "reward": 0.46875, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999220371246338, + "sampling/importance_sampling_ratio/min": 0.0012859756825491786, + "sampling/sampling_logp_difference/max": 6.656237602233887, + "sampling/sampling_logp_difference/mean": 0.021779976785182953, + "step": 405 + }, + { + "clip_ratio/high_max": 7.957685966175632e-06, + "clip_ratio/high_mean": 1.989421491543908e-06, + "clip_ratio/low_mean": 3.758041248147492e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.956983414354909e-05, + "completions/clipped_ratio": 0.046875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15669.0, + "completions/mean_length": 7620.21875, + "completions/mean_terminated_length": 7189.212890625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 1.035948596894741, + "epoch": 0.3735050597976081, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031219006050378084, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 356675829.0, + "reward": 0.296875, + "reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001060962677002, + "sampling/importance_sampling_ratio/min": 0.010141897015273571, + "sampling/sampling_logp_difference/max": 4.591080188751221, + "sampling/sampling_logp_difference/mean": 0.021951109170913696, + "step": 406 + }, + { + "clip_ratio/high_max": 2.286768199155631e-05, + "clip_ratio/high_mean": 5.7169204978890775e-06, + "clip_ratio/low_mean": 3.914574369900947e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.486266482217616e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14038.0, + "completions/mean_length": 5806.0234375, + "completions/mean_terminated_length": 5638.119140625, + "completions/min_length": 1319.0, + "completions/min_terminated_length": 1319.0, + "entropy": 0.8977029845118523, + "epoch": 0.37442502299908004, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002810312667861581, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 357438712.0, + "reward": 0.546875, + "reward_std": 0.22832970321178436, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999280571937561, + 
"sampling/importance_sampling_ratio/min": 0.0011738575994968414, + "sampling/sampling_logp_difference/max": 6.747459888458252, + "sampling/sampling_logp_difference/mean": 0.01965375244617462, + "step": 407 + }, + { + "clip_ratio/high_max": 1.2219379641464911e-05, + "clip_ratio/high_mean": 3.054844910366228e-06, + "clip_ratio/low_mean": 3.186109779562685e-05, + "clip_ratio/low_min": 4.3511558942554984e-06, + "clip_ratio/region_mean": 3.4915943160740426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15705.0, + "completions/max_terminated_length": 15705.0, + "completions/mean_length": 6537.4609375, + "completions/mean_terminated_length": 6537.4609375, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.9577726796269417, + "epoch": 0.37534498620055196, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004516562446951866, + "learning_rate": 1e-05, + "loss": 0.0517, + "num_tokens": 358296731.0, + "reward": 0.3828125, + "reward_std": 0.1830746978521347, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999170303344727, + "sampling/importance_sampling_ratio/min": 2.384942035860149e-06, + "sampling/sampling_logp_difference/max": 12.946335792541504, + "sampling/sampling_logp_difference/mean": 0.021242395043373108, + "step": 408 + }, + { + "clip_ratio/high_max": 1.4422689218918094e-05, + "clip_ratio/high_mean": 3.6056723047295236e-06, + "clip_ratio/low_mean": 3.026239573955536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3868068385345396e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 7896.671875, + "completions/mean_terminated_length": 7622.88671875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + 
"entropy": 0.9163230583071709, + "epoch": 0.37626494940202393, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003542230697348714, + "learning_rate": 1e-05, + "loss": 0.05, + "num_tokens": 359327001.0, + "reward": 0.375, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998560547828674, + "sampling/importance_sampling_ratio/min": 0.00010891625424847007, + "sampling/sampling_logp_difference/max": 9.124931335449219, + "sampling/sampling_logp_difference/mean": 0.020085681229829788, + "step": 409 + }, + { + "clip_ratio/high_max": 1.7827243254942005e-05, + "clip_ratio/high_mean": 5.474494003010477e-06, + "clip_ratio/low_mean": 4.2465159026505717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.793965263161226e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15297.0, + "completions/mean_length": 6728.7109375, + "completions/mean_terminated_length": 6652.68505859375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.9010183215141296, + "epoch": 0.37718491260349585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0035069347359240055, + "learning_rate": 1e-05, + "loss": 0.0518, + "num_tokens": 360208780.0, + "reward": 0.5390625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999571442604065, + "sampling/importance_sampling_ratio/min": 1.4739226571691688e-05, + "sampling/sampling_logp_difference/max": 11.124998092651367, + "sampling/sampling_logp_difference/mean": 0.021022530272603035, + "step": 410 + }, + { + "clip_ratio/high_max": 1.0376989393989788e-05, + "clip_ratio/high_mean": 
2.594247348497447e-06, + "clip_ratio/low_mean": 2.8587513156708155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1181759936771414e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 6800.3984375, + "completions/mean_terminated_length": 6491.25, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.8654960840940475, + "epoch": 0.3781048758049678, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033910400234162807, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 361098567.0, + "reward": 0.5625, + "reward_std": 0.2306838035583496, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998576641082764, + "sampling/importance_sampling_ratio/min": 0.001449413481168449, + "sampling/sampling_logp_difference/max": 6.536596298217773, + "sampling/sampling_logp_difference/mean": 0.019660964608192444, + "step": 411 + }, + { + "clip_ratio/high_max": 2.3068858354236e-05, + "clip_ratio/high_mean": 7.792090059410839e-06, + "clip_ratio/low_mean": 5.8515578757578623e-05, + "clip_ratio/low_min": 1.0348648629587842e-05, + "clip_ratio/region_mean": 6.630766870330262e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16373.0, + "completions/mean_length": 7103.4453125, + "completions/mean_terminated_length": 6956.13525390625, + "completions/min_length": 1711.0, + "completions/min_terminated_length": 1711.0, + "entropy": 0.8317076042294502, + "epoch": 0.37902483900643974, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036110079381614923, + "learning_rate": 1e-05, + "loss": 0.0834, + "num_tokens": 362027520.0, + "reward": 0.546875, + "reward_std": 0.33797892928123474, + "rewards/accuracy_reward/mean": 
0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999338984489441, + "sampling/importance_sampling_ratio/min": 1.0564122931100428e-05, + "sampling/sampling_logp_difference/max": 11.458046913146973, + "sampling/sampling_logp_difference/mean": 0.01939362846314907, + "step": 412 + }, + { + "clip_ratio/high_max": 3.112394779236638e-06, + "clip_ratio/high_mean": 7.780986948091595e-07, + "clip_ratio/low_mean": 5.127149995587388e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.204959859383962e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15830.0, + "completions/mean_length": 7344.9296875, + "completions/mean_terminated_length": 6900.384765625, + "completions/min_length": 1368.0, + "completions/min_terminated_length": 1368.0, + "entropy": 0.8387318029999733, + "epoch": 0.37994480220791166, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002141098491847515, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 362985207.0, + "reward": 0.34375, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999322891235352, + "sampling/importance_sampling_ratio/min": 1.8932745661004446e-05, + "sampling/sampling_logp_difference/max": 10.874617576599121, + "sampling/sampling_logp_difference/mean": 0.01929464004933834, + "step": 413 + }, + { + "clip_ratio/high_max": 5.2602786126954015e-06, + "clip_ratio/high_mean": 1.3150696531738504e-06, + "clip_ratio/low_mean": 1.7854434247510653e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9169503786997666e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16137.0, + 
"completions/mean_length": 6377.7734375, + "completions/mean_terminated_length": 6218.94482421875, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9732858911156654, + "epoch": 0.38086476540938363, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015244127716869116, + "learning_rate": 1e-05, + "loss": 0.0608, + "num_tokens": 363823914.0, + "reward": 0.4375, + "reward_std": 0.1988610327243805, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999363422393799, + "sampling/importance_sampling_ratio/min": 0.006335465237498283, + "sampling/sampling_logp_difference/max": 5.061592102050781, + "sampling/sampling_logp_difference/mean": 0.020688029006123543, + "step": 414 + }, + { + "clip_ratio/high_max": 2.6195500595349586e-05, + "clip_ratio/high_mean": 6.548875148837396e-06, + "clip_ratio/low_mean": 3.3802934012783226e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035180882056011e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14456.0, + "completions/mean_length": 5599.7890625, + "completions/mean_terminated_length": 5340.96826171875, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8872368410229683, + "epoch": 0.38178472861085555, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002647512126713991, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 364561127.0, + "reward": 0.453125, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999077916145325, + "sampling/importance_sampling_ratio/min": 2.370526999584399e-06, + "sampling/sampling_logp_difference/max": 
12.952398300170898, + "sampling/sampling_logp_difference/mean": 0.01878243312239647, + "step": 415 + }, + { + "clip_ratio/high_max": 2.157278959202813e-05, + "clip_ratio/high_mean": 5.3931973980070325e-06, + "clip_ratio/low_mean": 7.215861739950924e-05, + "clip_ratio/low_min": 1.4898997051204788e-05, + "clip_ratio/region_mean": 7.755181559332414e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15905.0, + "completions/mean_length": 7877.2890625, + "completions/mean_terminated_length": 7385.1650390625, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.8416353687644005, + "epoch": 0.3827046918123275, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018051012884825468, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 365590124.0, + "reward": 0.3125, + "reward_std": 0.28407180309295654, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.0004095165350008756, + "sampling/sampling_logp_difference/max": 7.800533294677734, + "sampling/sampling_logp_difference/mean": 0.019809434190392494, + "step": 416 + }, + { + "clip_ratio/high_max": 2.540994637456606e-05, + "clip_ratio/high_mean": 6.352486593641515e-06, + "clip_ratio/low_mean": 4.230594890941575e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8658435844117776e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16083.0, + "completions/mean_length": 6836.7890625, + "completions/mean_terminated_length": 6200.30859375, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "entropy": 0.8647575601935387, + "epoch": 0.38362465501379944, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.004550795070827007, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 366486337.0, + "reward": 0.40625, + "reward_std": 0.22620806097984314, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873638153076, + "sampling/importance_sampling_ratio/min": 0.0001089095021598041, + "sampling/sampling_logp_difference/max": 9.124993324279785, + "sampling/sampling_logp_difference/mean": 0.01992485672235489, + "step": 417 + }, + { + "clip_ratio/high_max": 1.1592664577619871e-05, + "clip_ratio/high_mean": 2.8981661444049678e-06, + "clip_ratio/low_mean": 3.5717548257707676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.861571451579948e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16286.0, + "completions/mean_length": 6884.953125, + "completions/mean_terminated_length": 6417.78662109375, + "completions/min_length": 1289.0, + "completions/min_terminated_length": 1289.0, + "entropy": 0.8691708743572235, + "epoch": 0.3845446182152714, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005958946421742439, + "learning_rate": 1e-05, + "loss": 0.1054, + "num_tokens": 367386163.0, + "reward": 0.5078125, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 9.519772902422119e-06, + "sampling/sampling_logp_difference/max": 11.562139511108398, + "sampling/sampling_logp_difference/mean": 0.019436441361904144, + "step": 418 + }, + { + "clip_ratio/high_max": 2.7658640192385064e-05, + "clip_ratio/high_mean": 8.455849524580117e-06, + "clip_ratio/low_mean": 3.938097847822064e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.7836828116487595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15574.0, + "completions/mean_length": 7439.1328125, + "completions/mean_terminated_length": 7150.58837890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.795464999973774, + "epoch": 0.38546458141674333, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00558120384812355, + "learning_rate": 1e-05, + "loss": 0.1918, + "num_tokens": 368357500.0, + "reward": 0.609375, + "reward_std": 0.3795146346092224, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.0001159337698481977, + "sampling/sampling_logp_difference/max": 9.062491416931152, + "sampling/sampling_logp_difference/mean": 0.018824251368641853, + "step": 419 + }, + { + "clip_ratio/high_max": 8.509555527780321e-06, + "clip_ratio/high_mean": 2.1273888819450804e-06, + "clip_ratio/low_mean": 3.0958593640662e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.308598269313734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16236.0, + "completions/mean_length": 6751.53125, + "completions/mean_terminated_length": 6520.3525390625, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "entropy": 0.9450879693031311, + "epoch": 0.38638454461821525, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004628168884664774, + "learning_rate": 1e-05, + "loss": 0.0859, + "num_tokens": 369242920.0, + "reward": 0.359375, + "reward_std": 0.20859163999557495, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.0006074689445085824, + "sampling/sampling_logp_difference/max": 7.406209468841553, + "sampling/sampling_logp_difference/mean": 0.019376013427972794, + "step": 420 + }, + { + "clip_ratio/high_max": 1.8288420505996328e-05, + "clip_ratio/high_mean": 4.572105126499082e-06, + "clip_ratio/low_mean": 4.86290555272717e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320115997164976e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16164.0, + "completions/mean_length": 7023.296875, + "completions/mean_terminated_length": 6315.3447265625, + "completions/min_length": 1628.0, + "completions/min_terminated_length": 1628.0, + "entropy": 0.7378111630678177, + "epoch": 0.3873045078196872, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00389425759203732, + "learning_rate": 1e-05, + "loss": 0.1066, + "num_tokens": 370159510.0, + "reward": 0.4921875, + "reward_std": 0.323777437210083, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999127388000488, + "sampling/importance_sampling_ratio/min": 0.00014012664905749261, + "sampling/sampling_logp_difference/max": 8.872963905334473, + "sampling/sampling_logp_difference/mean": 0.016914553940296173, + "step": 421 + }, + { + "clip_ratio/high_max": 2.1269573153404053e-05, + "clip_ratio/high_mean": 5.948400371380558e-06, + "clip_ratio/low_mean": 2.3538930747690756e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9487331687505502e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16018.0, + "completions/max_terminated_length": 16018.0, + "completions/mean_length": 7702.3046875, + "completions/mean_terminated_length": 7702.3046875, + "completions/min_length": 423.0, + 
"completions/min_terminated_length": 423.0, + "entropy": 0.9053447172045708, + "epoch": 0.38822447102115915, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004324545152485371, + "learning_rate": 1e-05, + "loss": 0.0149, + "num_tokens": 371162773.0, + "reward": 0.2421875, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00001060962677, + "sampling/importance_sampling_ratio/min": 2.283278627146501e-05, + "sampling/sampling_logp_difference/max": 10.687313079833984, + "sampling/sampling_logp_difference/mean": 0.020495830103754997, + "step": 422 + }, + { + "clip_ratio/high_max": 1.0294916819475475e-05, + "clip_ratio/high_mean": 2.5737292048688687e-06, + "clip_ratio/low_mean": 5.831611520079605e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.088984559937671e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15295.0, + "completions/mean_length": 6904.78125, + "completions/mean_terminated_length": 6754.31787109375, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.7991176024079323, + "epoch": 0.3891444342226311, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003239463549107313, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 372067241.0, + "reward": 0.328125, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.00012340991816017777, + "sampling/sampling_logp_difference/max": 8.999999046325684, + "sampling/sampling_logp_difference/mean": 0.019042208790779114, + "step": 423 + }, + { + "clip_ratio/high_max": 
2.7261318791715894e-05, + "clip_ratio/high_mean": 7.926559305815317e-06, + "clip_ratio/low_mean": 1.552133551285806e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3447895273420727e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15399.0, + "completions/mean_length": 6107.7421875, + "completions/mean_terminated_length": 5602.35205078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.9495253190398216, + "epoch": 0.39006439742410304, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015464330790564418, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 372866072.0, + "reward": 0.421875, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971330165863, + "sampling/importance_sampling_ratio/min": 0.00024684349773451686, + "sampling/sampling_logp_difference/max": 8.306756019592285, + "sampling/sampling_logp_difference/mean": 0.019793221727013588, + "step": 424 + }, + { + "clip_ratio/high_max": 2.457227401464479e-05, + "clip_ratio/high_mean": 8.533324717063806e-06, + "clip_ratio/low_mean": 3.261690835643094e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.115023284612107e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15939.0, + "completions/mean_length": 6079.8046875, + "completions/mean_terminated_length": 5747.4111328125, + "completions/min_length": 1082.0, + "completions/min_terminated_length": 1082.0, + "entropy": 0.8005363270640373, + "epoch": 0.39098436062557496, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024811832699924707, + "learning_rate": 1e-05, + "loss": 0.1124, + "num_tokens": 373663463.0, + "reward": 0.625, + "reward_std": 
0.2630355656147003, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743103981018, + "sampling/importance_sampling_ratio/min": 0.00019348970090504736, + "sampling/sampling_logp_difference/max": 8.550286293029785, + "sampling/sampling_logp_difference/mean": 0.017151469364762306, + "step": 425 + }, + { + "clip_ratio/high_max": 3.3719989005476236e-06, + "clip_ratio/high_mean": 8.429997251369059e-07, + "clip_ratio/low_mean": 2.132218082806503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2165180553201935e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14925.0, + "completions/mean_length": 6453.7890625, + "completions/mean_terminated_length": 6375.5986328125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "entropy": 0.9212624430656433, + "epoch": 0.39190432382704693, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031475063879042864, + "learning_rate": 1e-05, + "loss": 0.0959, + "num_tokens": 374517492.0, + "reward": 0.34375, + "reward_std": 0.19910329580307007, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999594688415527, + "sampling/importance_sampling_ratio/min": 0.015664709731936455, + "sampling/sampling_logp_difference/max": 4.156344890594482, + "sampling/sampling_logp_difference/mean": 0.019899867475032806, + "step": 426 + }, + { + "clip_ratio/high_max": 1.907509408738406e-05, + "clip_ratio/high_mean": 5.984868664654641e-06, + "clip_ratio/low_mean": 3.784128080042137e-05, + "clip_ratio/low_min": 3.7751804029539926e-06, + "clip_ratio/region_mean": 4.382614952191943e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16159.0, + 
"completions/max_terminated_length": 16159.0, + "completions/mean_length": 6126.9921875, + "completions/mean_terminated_length": 6126.9921875, + "completions/min_length": 1106.0, + "completions/min_terminated_length": 1106.0, + "entropy": 0.8252849578857422, + "epoch": 0.39282428702851885, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004200868774205446, + "learning_rate": 1e-05, + "loss": 0.0276, + "num_tokens": 375320339.0, + "reward": 0.4140625, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999815225601196, + "sampling/importance_sampling_ratio/min": 0.005763276945799589, + "sampling/sampling_logp_difference/max": 5.156249046325684, + "sampling/sampling_logp_difference/mean": 0.01833093911409378, + "step": 427 + }, + { + "clip_ratio/high_max": 1.8918785372079583e-05, + "clip_ratio/high_mean": 5.476571459439583e-06, + "clip_ratio/low_mean": 6.169724406390742e-05, + "clip_ratio/low_min": 7.494657666029525e-06, + "clip_ratio/region_mean": 6.717381506859965e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15411.0, + "completions/mean_length": 6739.09375, + "completions/mean_terminated_length": 6427.9677734375, + "completions/min_length": 1228.0, + "completions/min_terminated_length": 1228.0, + "entropy": 0.8008574098348618, + "epoch": 0.3937442502299908, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003204014617949724, + "learning_rate": 1e-05, + "loss": 0.0481, + "num_tokens": 376201015.0, + "reward": 0.5390625, + "reward_std": 0.37086254358291626, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998303651809692, + "sampling/importance_sampling_ratio/min": 
0.00010144581028725952, + "sampling/sampling_logp_difference/max": 9.195985794067383, + "sampling/sampling_logp_difference/mean": 0.018961725756525993, + "step": 428 + }, + { + "clip_ratio/high_max": 1.3558789078160771e-05, + "clip_ratio/high_mean": 3.389697269540193e-06, + "clip_ratio/low_mean": 5.3925050679026754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.731474743697618e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15634.0, + "completions/mean_length": 7245.8984375, + "completions/mean_terminated_length": 6951.12060546875, + "completions/min_length": 1306.0, + "completions/min_terminated_length": 1306.0, + "entropy": 1.0351596996188164, + "epoch": 0.39466421343146274, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0039763906970620155, + "learning_rate": 1e-05, + "loss": 0.0299, + "num_tokens": 377149650.0, + "reward": 0.375, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000600814819336, + "sampling/importance_sampling_ratio/min": 8.106228051474318e-05, + "sampling/sampling_logp_difference/max": 9.420292854309082, + "sampling/sampling_logp_difference/mean": 0.020948028191924095, + "step": 429 + }, + { + "clip_ratio/high_max": 1.4580486549675697e-05, + "clip_ratio/high_mean": 4.259903903403028e-06, + "clip_ratio/low_mean": 4.6149686397711775e-05, + "clip_ratio/low_min": 3.006686938533676e-06, + "clip_ratio/region_mean": 5.04095905853319e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15608.0, + "completions/mean_length": 6958.625, + "completions/mean_terminated_length": 6495.08154296875, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "entropy": 0.8360240310430527, + "epoch": 
0.39558417663293466, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0031417158897966146, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 378057802.0, + "reward": 0.515625, + "reward_std": 0.35771697759628296, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999384880065918, + "sampling/importance_sampling_ratio/min": 0.00010235882655251771, + "sampling/sampling_logp_difference/max": 9.187026023864746, + "sampling/sampling_logp_difference/mean": 0.019185224547982216, + "step": 430 + }, + { + "clip_ratio/high_max": 6.681633749394678e-06, + "clip_ratio/high_mean": 1.6704084373486694e-06, + "clip_ratio/low_mean": 5.096616632727091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.263657521936693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15410.0, + "completions/max_terminated_length": 15410.0, + "completions/mean_length": 5696.3984375, + "completions/mean_terminated_length": 5696.3984375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.7887749597430229, + "epoch": 0.39650413983440663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004943124484270811, + "learning_rate": 1e-05, + "loss": 0.096, + "num_tokens": 378808021.0, + "reward": 0.515625, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999057054519653, + "sampling/importance_sampling_ratio/min": 0.0015042300801724195, + "sampling/sampling_logp_difference/max": 6.499474048614502, + "sampling/sampling_logp_difference/mean": 0.018845941871404648, + "step": 431 + }, + { + "clip_ratio/high_max": 1.7526824194646906e-05, + "clip_ratio/high_mean": 5.417880970526312e-06, + "clip_ratio/low_mean": 
3.513921649300755e-05, + "clip_ratio/low_min": 6.075038982089609e-06, + "clip_ratio/region_mean": 4.0557096895099676e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14233.0, + "completions/mean_length": 6480.8828125, + "completions/mean_terminated_length": 6323.69091796875, + "completions/min_length": 1013.0, + "completions/min_terminated_length": 1013.0, + "entropy": 0.8796411231160164, + "epoch": 0.39742410303587855, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00595651101320982, + "learning_rate": 1e-05, + "loss": 0.0546, + "num_tokens": 379659710.0, + "reward": 0.3984375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 0.0017907419241964817, + "sampling/sampling_logp_difference/max": 6.325125217437744, + "sampling/sampling_logp_difference/mean": 0.01906527951359749, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4512424602107785e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4512424602107785e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 7501.703125, + "completions/mean_terminated_length": 6829.93310546875, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "entropy": 0.786028303205967, + "epoch": 0.3983440662373505, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0024527597706764936, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 380640720.0, + "reward": 0.5234375, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999595880508423, + "sampling/importance_sampling_ratio/min": 8.851602615322918e-07, + "sampling/sampling_logp_difference/max": 13.93749713897705, + "sampling/sampling_logp_difference/mean": 0.01873261108994484, + "step": 433 + }, + { + "clip_ratio/high_max": 1.4606259583160863e-05, + "clip_ratio/high_mean": 5.505394312876888e-06, + "clip_ratio/low_mean": 3.1679782978244475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7185177234277944e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15185.0, + "completions/mean_length": 5619.2890625, + "completions/mean_terminated_length": 5448.4208984375, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.8098893761634827, + "epoch": 0.39926402943882244, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004280989523977041, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 381377981.0, + "reward": 0.609375, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.0010248658945783973, + "sampling/sampling_logp_difference/max": 6.883193492889404, + "sampling/sampling_logp_difference/mean": 0.017923470586538315, + "step": 434 + }, + { + "clip_ratio/high_max": 1.4808703554081148e-05, + "clip_ratio/high_mean": 3.702175888520287e-06, + "clip_ratio/low_mean": 2.3637440563106793e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7339616224253405e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5243.8203125, + "completions/mean_terminated_length": 
5156.1025390625, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "entropy": 0.7485036551952362, + "epoch": 0.40018399264029436, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004721642471849918, + "learning_rate": 1e-05, + "loss": 0.0877, + "num_tokens": 382070478.0, + "reward": 0.6875, + "reward_std": 0.26538965106010437, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999414086341858, + "sampling/importance_sampling_ratio/min": 0.0011518355458974838, + "sampling/sampling_logp_difference/max": 6.7663984298706055, + "sampling/sampling_logp_difference/mean": 0.016579966992139816, + "step": 435 + }, + { + "clip_ratio/high_max": 3.1177480195765384e-05, + "clip_ratio/high_mean": 1.1174359769938746e-05, + "clip_ratio/low_mean": 3.602651599976525e-05, + "clip_ratio/low_min": 4.348733455117326e-06, + "clip_ratio/region_mean": 4.720087713394605e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15978.0, + "completions/mean_length": 7021.1796875, + "completions/mean_terminated_length": 6872.56396484375, + "completions/min_length": 1371.0, + "completions/min_terminated_length": 1371.0, + "entropy": 0.8693460151553154, + "epoch": 0.40110395584176634, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00329192029312253, + "learning_rate": 1e-05, + "loss": 0.0342, + "num_tokens": 382990245.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999822378158569, + "sampling/importance_sampling_ratio/min": 0.0023386883549392223, + "sampling/sampling_logp_difference/max": 6.058165073394775, + "sampling/sampling_logp_difference/mean": 
0.019863136112689972, + "step": 436 + }, + { + "clip_ratio/high_max": 1.1192694955752813e-05, + "clip_ratio/high_mean": 2.7981737389382033e-06, + "clip_ratio/low_mean": 4.9078003257818636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.1876177280973934e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15344.0, + "completions/mean_length": 6917.625, + "completions/mean_terminated_length": 6452.0654296875, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "entropy": 0.8466897681355476, + "epoch": 0.40202391904323825, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0051889242604374886, + "learning_rate": 1e-05, + "loss": 0.1009, + "num_tokens": 383896717.0, + "reward": 0.4140625, + "reward_std": 0.3448137044906616, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999983310699463, + "sampling/importance_sampling_ratio/min": 0.00015846389578655362, + "sampling/sampling_logp_difference/max": 8.749983787536621, + "sampling/sampling_logp_difference/mean": 0.019528398290276527, + "step": 437 + }, + { + "clip_ratio/high_max": 2.3224948108691024e-05, + "clip_ratio/high_mean": 8.263948757303297e-06, + "clip_ratio/low_mean": 3.8556312347282073e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.682026019509067e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16175.0, + "completions/mean_length": 7487.5078125, + "completions/mean_terminated_length": 7346.2939453125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.9584660083055496, + "epoch": 0.4029438822447102, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002855573548004031, + "learning_rate": 1e-05, + "loss": 0.0087, + 
"num_tokens": 384872622.0, + "reward": 0.3828125, + "reward_std": 0.2477683424949646, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999386668205261, + "sampling/importance_sampling_ratio/min": 0.0038593418430536985, + "sampling/sampling_logp_difference/max": 5.557258605957031, + "sampling/sampling_logp_difference/mean": 0.0209865253418684, + "step": 438 + }, + { + "clip_ratio/high_max": 6.171620498207631e-06, + "clip_ratio/high_mean": 1.5429051245519076e-06, + "clip_ratio/low_mean": 2.98128834401723e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.135578845103737e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16092.0, + "completions/mean_length": 6637.5078125, + "completions/mean_terminated_length": 6323.1044921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 0.8841215297579765, + "epoch": 0.40386384544618215, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004437311552464962, + "learning_rate": 1e-05, + "loss": 0.0523, + "num_tokens": 385744023.0, + "reward": 0.3984375, + "reward_std": 0.2603819966316223, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999136924743652, + "sampling/importance_sampling_ratio/min": 0.002925124252215028, + "sampling/sampling_logp_difference/max": 5.834418296813965, + "sampling/sampling_logp_difference/mean": 0.019490888342261314, + "step": 439 + }, + { + "clip_ratio/high_max": 1.3304874300956726e-05, + "clip_ratio/high_mean": 3.3262185752391815e-06, + "clip_ratio/low_mean": 5.443932013804442e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.776553894065728e-05, + "completions/clipped_ratio": 
0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15143.0, + "completions/mean_length": 5965.9765625, + "completions/mean_terminated_length": 5800.611328125, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "entropy": 0.8726934269070625, + "epoch": 0.4047838086476541, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002463799435645342, + "learning_rate": 1e-05, + "loss": -0.0075, + "num_tokens": 386525492.0, + "reward": 0.3984375, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.00020367901015561074, + "sampling/sampling_logp_difference/max": 8.4989652633667, + "sampling/sampling_logp_difference/mean": 0.01946769654750824, + "step": 440 + }, + { + "clip_ratio/high_max": 1.0084711902891286e-05, + "clip_ratio/high_mean": 3.6154040117253317e-06, + "clip_ratio/low_mean": 3.598771945689805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9603123695997056e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6693.109375, + "completions/mean_terminated_length": 6616.80322265625, + "completions/min_length": 1704.0, + "completions/min_terminated_length": 1704.0, + "entropy": 0.9430640190839767, + "epoch": 0.40570377184912604, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038990566972643137, + "learning_rate": 1e-05, + "loss": 0.0415, + "num_tokens": 387404842.0, + "reward": 0.421875, + "reward_std": 0.31587693095207214, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999700784683228, + 
"sampling/importance_sampling_ratio/min": 0.0011708902893587947, + "sampling/sampling_logp_difference/max": 6.749990940093994, + "sampling/sampling_logp_difference/mean": 0.020848294720053673, + "step": 441 + }, + { + "clip_ratio/high_max": 7.462686426151777e-06, + "clip_ratio/high_mean": 1.8656716065379442e-06, + "clip_ratio/low_mean": 5.234285907818048e-05, + "clip_ratio/low_min": 4.47803950009984e-06, + "clip_ratio/region_mean": 5.420853057103159e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + "completions/mean_length": 7045.6953125, + "completions/mean_terminated_length": 6505.46240234375, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "entropy": 0.8912066072225571, + "epoch": 0.40662373505059796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018510994268581271, + "learning_rate": 1e-05, + "loss": 0.099, + "num_tokens": 388324475.0, + "reward": 0.40625, + "reward_std": 0.32195523381233215, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999024868011475, + "sampling/importance_sampling_ratio/min": 0.0031757301185280085, + "sampling/sampling_logp_difference/max": 5.752217769622803, + "sampling/sampling_logp_difference/mean": 0.020547039806842804, + "step": 442 + }, + { + "clip_ratio/high_max": 2.504527083146968e-05, + "clip_ratio/high_mean": 6.26131770786742e-06, + "clip_ratio/low_mean": 6.165269871871715e-05, + "clip_ratio/low_min": 3.5272871627967106e-06, + "clip_ratio/region_mean": 6.791401551708987e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15734.0, + "completions/mean_length": 7480.0078125, + "completions/mean_terminated_length": 7266.3125, + "completions/min_length": 1130.0, + "completions/min_terminated_length": 
1130.0, + "entropy": 0.8813760280609131, + "epoch": 0.40754369825206993, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004439481534063816, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 389305644.0, + "reward": 0.34375, + "reward_std": 0.31300368905067444, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999762773513794, + "sampling/importance_sampling_ratio/min": 0.007449973840266466, + "sampling/sampling_logp_difference/max": 4.899544715881348, + "sampling/sampling_logp_difference/mean": 0.01973455585539341, + "step": 443 + }, + { + "clip_ratio/high_max": 4.0980917219712865e-06, + "clip_ratio/high_mean": 1.0245229304928216e-06, + "clip_ratio/low_mean": 3.662567087303614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.76501939172158e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15302.0, + "completions/max_terminated_length": 15302.0, + "completions/mean_length": 7044.4453125, + "completions/mean_terminated_length": 7044.4453125, + "completions/min_length": 1229.0, + "completions/min_terminated_length": 1229.0, + "entropy": 0.9901906549930573, + "epoch": 0.40846366145354185, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004181519150733948, + "learning_rate": 1e-05, + "loss": -0.0068, + "num_tokens": 390229373.0, + "reward": 0.421875, + "reward_std": 0.17700131237506866, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000314712524414, + "sampling/importance_sampling_ratio/min": 0.00022536676260642707, + "sampling/sampling_logp_difference/max": 8.397781372070312, + "sampling/sampling_logp_difference/mean": 0.021211043000221252, + "step": 444 + }, + { + "clip_ratio/high_max": 1.4909872106727562e-05, + "clip_ratio/high_mean": 
3.7274680266818905e-06, + "clip_ratio/low_mean": 5.29995777469594e-05, + "clip_ratio/low_min": 3.708758640641463e-06, + "clip_ratio/region_mean": 5.672704537573736e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7815.8125, + "completions/mean_terminated_length": 7244.6005859375, + "completions/min_length": 1350.0, + "completions/min_terminated_length": 1350.0, + "entropy": 0.8278292864561081, + "epoch": 0.4093836246550138, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002691390924155712, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 391251141.0, + "reward": 0.3515625, + "reward_std": 0.31222954392433167, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 0.007715471088886261, + "sampling/sampling_logp_difference/max": 4.864527702331543, + "sampling/sampling_logp_difference/mean": 0.018415704369544983, + "step": 445 + }, + { + "clip_ratio/high_max": 2.1858722902834415e-05, + "clip_ratio/high_mean": 6.629899417021079e-06, + "clip_ratio/low_mean": 3.196247394043894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.859237290271267e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15202.0, + "completions/mean_length": 5305.1796875, + "completions/mean_terminated_length": 5217.94482421875, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.8100772425532341, + "epoch": 0.41030358785648574, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0069543467834591866, + "learning_rate": 1e-05, + "loss": 0.1153, + "num_tokens": 391956196.0, + "reward": 0.609375, + "reward_std": 0.304571270942688, + 
"rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000190734863281, + "sampling/importance_sampling_ratio/min": 0.0024869756307452917, + "sampling/sampling_logp_difference/max": 5.996687889099121, + "sampling/sampling_logp_difference/mean": 0.017318082973361015, + "step": 446 + }, + { + "clip_ratio/high_max": 2.461934036546154e-05, + "clip_ratio/high_mean": 8.056288947955181e-06, + "clip_ratio/low_mean": 5.289376917971822e-05, + "clip_ratio/low_min": 4.21926688431995e-06, + "clip_ratio/region_mean": 6.0950058468733914e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15300.0, + "completions/mean_length": 7299.578125, + "completions/mean_terminated_length": 6930.29248046875, + "completions/min_length": 1008.0, + "completions/min_terminated_length": 1008.0, + "entropy": 0.9955824315547943, + "epoch": 0.41122355105795766, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0065611582249403, + "learning_rate": 1e-05, + "loss": 0.0883, + "num_tokens": 392908430.0, + "reward": 0.4375, + "reward_std": 0.304571270942688, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999696016311646, + "sampling/importance_sampling_ratio/min": 6.9738744059577584e-06, + "sampling/sampling_logp_difference/max": 11.873339653015137, + "sampling/sampling_logp_difference/mean": 0.02127375639975071, + "step": 447 + }, + { + "clip_ratio/high_max": 2.4339562514796853e-05, + "clip_ratio/high_mean": 7.412756531266496e-06, + "clip_ratio/low_mean": 3.89272447591793e-05, + "clip_ratio/low_min": 4.047796210215893e-06, + "clip_ratio/region_mean": 4.6340001517819474e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16221.0, + "completions/mean_length": 6702.9375, + "completions/mean_terminated_length": 6390.64501953125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.82919991761446, + "epoch": 0.41214351425942963, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032975098583847284, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 393788286.0, + "reward": 0.4609375, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.00028582560480572283, + "sampling/sampling_logp_difference/max": 8.160128593444824, + "sampling/sampling_logp_difference/mean": 0.019461583346128464, + "step": 448 + }, + { + "clip_ratio/high_max": 2.3807599063729867e-05, + "clip_ratio/high_mean": 5.951899765932467e-06, + "clip_ratio/low_mean": 3.195798365140945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.790988330365508e-05, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15244.0, + "completions/mean_length": 6468.9453125, + "completions/mean_terminated_length": 5536.7607421875, + "completions/min_length": 808.0, + "completions/min_terminated_length": 808.0, + "entropy": 0.6471721827983856, + "epoch": 0.41306347746090155, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032787907402962446, + "learning_rate": 1e-05, + "loss": 0.1149, + "num_tokens": 394638159.0, + "reward": 0.625, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 0.00012341380352154374, 
+ "sampling/sampling_logp_difference/max": 8.999967575073242, + "sampling/sampling_logp_difference/mean": 0.016151495277881622, + "step": 449 + }, + { + "clip_ratio/high_max": 2.247072688987828e-05, + "clip_ratio/high_mean": 5.61768172246957e-06, + "clip_ratio/low_mean": 6.035319393049576e-05, + "clip_ratio/low_min": 4.063190772285452e-06, + "clip_ratio/region_mean": 6.597087667614687e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15931.0, + "completions/mean_length": 6547.3203125, + "completions/mean_terminated_length": 6230.0078125, + "completions/min_length": 587.0, + "completions/min_terminated_length": 587.0, + "entropy": 0.9123960956931114, + "epoch": 0.4139834406623735, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038375966250896454, + "learning_rate": 1e-05, + "loss": 0.0967, + "num_tokens": 395493872.0, + "reward": 0.4296875, + "reward_std": 0.30798619985580444, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999747276306152, + "sampling/importance_sampling_ratio/min": 0.00016009423416107893, + "sampling/sampling_logp_difference/max": 8.739748001098633, + "sampling/sampling_logp_difference/mean": 0.019957344979047775, + "step": 450 + }, + { + "clip_ratio/high_max": 1.404482372890925e-05, + "clip_ratio/high_mean": 3.5112059322273126e-06, + "clip_ratio/low_mean": 2.315102483407827e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6662230766305584e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15058.0, + "completions/mean_length": 6291.859375, + "completions/mean_terminated_length": 6131.6669921875, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 0.9841655194759369, + "epoch": 0.41490340386384544, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.003903903067111969, + "learning_rate": 1e-05, + "loss": 0.0656, + "num_tokens": 396320254.0, + "reward": 0.4296875, + "reward_std": 0.2569621503353119, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452829360962, + "sampling/importance_sampling_ratio/min": 6.564632712979801e-06, + "sampling/sampling_logp_difference/max": 11.93381404876709, + "sampling/sampling_logp_difference/mean": 0.020753150805830956, + "step": 451 + }, + { + "clip_ratio/high_max": 1.5189204987109406e-05, + "clip_ratio/high_mean": 4.615214265868417e-06, + "clip_ratio/low_mean": 3.547988831087423e-05, + "clip_ratio/low_min": 3.3967392027989263e-06, + "clip_ratio/region_mean": 4.009510257674265e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15966.0, + "completions/mean_length": 7692.4296875, + "completions/mean_terminated_length": 7339.11376953125, + "completions/min_length": 1269.0, + "completions/min_terminated_length": 1269.0, + "entropy": 0.94080401211977, + "epoch": 0.41582336706531736, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005152889993041754, + "learning_rate": 1e-05, + "loss": 0.0511, + "num_tokens": 397327029.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 5.027571751270443e-05, + "sampling/sampling_logp_difference/max": 9.897988319396973, + "sampling/sampling_logp_difference/mean": 0.02036213129758835, + "step": 452 + }, + { + "clip_ratio/high_max": 1.733157705530175e-05, + "clip_ratio/high_mean": 6.0586507970583625e-06, + 
"clip_ratio/low_mean": 2.335082047011383e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9409470812424843e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15305.0, + "completions/mean_length": 6968.0859375, + "completions/mean_terminated_length": 6742.1044921875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "entropy": 0.9254838973283768, + "epoch": 0.41674333026678934, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0035838852636516094, + "learning_rate": 1e-05, + "loss": 0.0182, + "num_tokens": 398237536.0, + "reward": 0.484375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.002404628787189722, + "sampling/sampling_logp_difference/max": 6.030359745025635, + "sampling/sampling_logp_difference/mean": 0.020200733095407486, + "step": 453 + }, + { + "clip_ratio/high_max": 4.464923677005572e-06, + "clip_ratio/high_mean": 1.116230919251393e-06, + "clip_ratio/low_mean": 3.311113533754906e-05, + "clip_ratio/low_min": 6.725854291289579e-06, + "clip_ratio/region_mean": 3.422736637048729e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16309.0, + "completions/mean_length": 8711.078125, + "completions/mean_terminated_length": 8199.55078125, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "entropy": 0.8735406622290611, + "epoch": 0.41766329346826125, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0036290446296334267, + "learning_rate": 1e-05, + "loss": 0.0412, + "num_tokens": 399373298.0, + "reward": 0.359375, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000042200088501, + "sampling/importance_sampling_ratio/min": 9.216561011271551e-05, + "sampling/sampling_logp_difference/max": 9.291923522949219, + "sampling/sampling_logp_difference/mean": 0.0201371181756258, + "step": 454 + }, + { + "clip_ratio/high_max": 3.4702664606811595e-05, + "clip_ratio/high_mean": 8.675666151702899e-06, + "clip_ratio/low_mean": 3.3217100849469716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.189276808119757e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14737.0, + "completions/mean_length": 6891.078125, + "completions/mean_terminated_length": 6663.24853515625, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "entropy": 0.8689641878008842, + "epoch": 0.41858325666973323, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004067540634423494, + "learning_rate": 1e-05, + "loss": 0.0633, + "num_tokens": 400273708.0, + "reward": 0.484375, + "reward_std": 0.27274850010871887, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999425411224365, + "sampling/importance_sampling_ratio/min": 4.0002717582865444e-07, + "sampling/sampling_logp_difference/max": 14.731733322143555, + "sampling/sampling_logp_difference/mean": 0.019800148904323578, + "step": 455 + }, + { + "clip_ratio/high_max": 2.939170826721238e-06, + "clip_ratio/high_mean": 7.347927066803095e-07, + "clip_ratio/low_mean": 3.564125790944672e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6376050502440194e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15234.0, + "completions/mean_length": 
6899.3515625, + "completions/mean_terminated_length": 6748.8017578125, + "completions/min_length": 1149.0, + "completions/min_terminated_length": 1149.0, + "entropy": 0.9442604705691338, + "epoch": 0.41950321987120515, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0026191689539700747, + "learning_rate": 1e-05, + "loss": 0.0743, + "num_tokens": 401177497.0, + "reward": 0.46875, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 0.0017910725437104702, + "sampling/sampling_logp_difference/max": 6.3249406814575195, + "sampling/sampling_logp_difference/mean": 0.021380646154284477, + "step": 456 + }, + { + "clip_ratio/high_max": 8.99604128790088e-06, + "clip_ratio/high_mean": 2.24901032197522e-06, + "clip_ratio/low_mean": 2.57235833487357e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.797259367071092e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16226.0, + "completions/mean_length": 7175.8359375, + "completions/mean_terminated_length": 7029.6748046875, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.8653769046068192, + "epoch": 0.4204231830726771, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003141516586765647, + "learning_rate": 1e-05, + "loss": 0.0674, + "num_tokens": 402115812.0, + "reward": 0.4375, + "reward_std": 0.21040895581245422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999862909317017, + "sampling/importance_sampling_ratio/min": 0.001265019178390503, + "sampling/sampling_logp_difference/max": 6.672667980194092, + 
"sampling/sampling_logp_difference/mean": 0.01970163732767105, + "step": 457 + }, + { + "clip_ratio/high_max": 1.0800059499160852e-05, + "clip_ratio/high_mean": 2.700014874790213e-06, + "clip_ratio/low_mean": 3.116219727417047e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3862211807900167e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16250.0, + "completions/mean_length": 7090.8515625, + "completions/mean_terminated_length": 6791.072265625, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.9437825232744217, + "epoch": 0.42134314627414904, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001980370609089732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 403048385.0, + "reward": 0.4609375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 1.4011449138706666e-06, + "sampling/sampling_logp_difference/max": 13.47822093963623, + "sampling/sampling_logp_difference/mean": 0.021090596914291382, + "step": 458 + }, + { + "clip_ratio/high_max": 2.5482850560365478e-05, + "clip_ratio/high_mean": 6.370712640091369e-06, + "clip_ratio/low_mean": 4.8558076969129615e-05, + "clip_ratio/low_min": 4.8952420002024155e-06, + "clip_ratio/region_mean": 5.4928788131292094e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16175.0, + "completions/mean_length": 7033.65625, + "completions/mean_terminated_length": 6809.24853515625, + "completions/min_length": 1007.0, + "completions/min_terminated_length": 1007.0, + "entropy": 0.8789731040596962, + "epoch": 0.42226310947562096, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.003833206370472908, + "learning_rate": 1e-05, + "loss": 0.059, + "num_tokens": 403968037.0, + "reward": 0.46875, + "reward_std": 0.28460076451301575, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000317096710205, + "sampling/importance_sampling_ratio/min": 0.0021942879538983107, + "sampling/sampling_logp_difference/max": 6.1218976974487305, + "sampling/sampling_logp_difference/mean": 0.019913772121071815, + "step": 459 + }, + { + "clip_ratio/high_max": 4.068877842655638e-06, + "clip_ratio/high_mean": 1.0172194606639096e-06, + "clip_ratio/low_mean": 6.774969961043098e-05, + "clip_ratio/low_min": 3.189914878021227e-06, + "clip_ratio/region_mean": 6.876691895740805e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 6992.8984375, + "completions/mean_terminated_length": 6611.14599609375, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "entropy": 0.857115626335144, + "epoch": 0.42318307267709293, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005315023008733988, + "learning_rate": 1e-05, + "loss": 0.1581, + "num_tokens": 404881584.0, + "reward": 0.3515625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000758171081543, + "sampling/importance_sampling_ratio/min": 4.546630952972919e-05, + "sampling/sampling_logp_difference/max": 9.998538970947266, + "sampling/sampling_logp_difference/mean": 0.01872519962489605, + "step": 460 + }, + { + "clip_ratio/high_max": 1.167047457784065e-05, + "clip_ratio/high_mean": 2.9176186444601626e-06, + "clip_ratio/low_mean": 3.3195502112448594e-05, + "clip_ratio/low_min": 
5.25188033861923e-06, + "clip_ratio/region_mean": 3.611312064322192e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16095.0, + "completions/mean_length": 6623.2578125, + "completions/mean_terminated_length": 6226.4794921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.8803941905498505, + "epoch": 0.42410303587856485, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0074885934591293335, + "learning_rate": 1e-05, + "loss": 0.1076, + "num_tokens": 405749105.0, + "reward": 0.515625, + "reward_std": 0.25354722142219543, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999799728393555, + "sampling/importance_sampling_ratio/min": 0.0011723897187039256, + "sampling/sampling_logp_difference/max": 6.748711109161377, + "sampling/sampling_logp_difference/mean": 0.01930626854300499, + "step": 461 + }, + { + "clip_ratio/high_max": 4.11753080697963e-06, + "clip_ratio/high_mean": 1.0293827017449075e-06, + "clip_ratio/low_mean": 5.09268712676203e-05, + "clip_ratio/low_min": 1.1170248626513057e-05, + "clip_ratio/region_mean": 5.195625465148623e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15032.0, + "completions/mean_length": 7244.8203125, + "completions/mean_terminated_length": 6647.5419921875, + "completions/min_length": 1227.0, + "completions/min_terminated_length": 1227.0, + "entropy": 0.9202689751982689, + "epoch": 0.4250229990800368, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003960717935115099, + "learning_rate": 1e-05, + "loss": 0.0536, + "num_tokens": 406704618.0, + "reward": 0.484375, + "reward_std": 0.2880108058452606, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 1.69715603988152e-05, + "sampling/sampling_logp_difference/max": 10.98397159576416, + "sampling/sampling_logp_difference/mean": 0.02019711770117283, + "step": 462 + }, + { + "clip_ratio/high_max": 2.874629831239872e-05, + "clip_ratio/high_mean": 1.0519701334033016e-05, + "clip_ratio/low_mean": 5.367962035052187e-05, + "clip_ratio/low_min": 6.5083827394119e-06, + "clip_ratio/region_mean": 6.419932219614566e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 7462.0546875, + "completions/mean_terminated_length": 6867.2587890625, + "completions/min_length": 669.0, + "completions/min_terminated_length": 669.0, + "entropy": 0.8141553401947021, + "epoch": 0.42594296228150874, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003602087963372469, + "learning_rate": 1e-05, + "loss": 0.1054, + "num_tokens": 407677177.0, + "reward": 0.421875, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999440312385559, + "sampling/importance_sampling_ratio/min": 0.0007806668290868402, + "sampling/sampling_logp_difference/max": 7.155362129211426, + "sampling/sampling_logp_difference/mean": 0.01856713369488716, + "step": 463 + }, + { + "clip_ratio/high_max": 2.6413443720230134e-05, + "clip_ratio/high_mean": 8.973188073468918e-06, + "clip_ratio/low_mean": 3.5997712757307454e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.497090230870526e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15750.0, + "completions/mean_length": 6683.1796875, + 
"completions/mean_terminated_length": 6529.19873046875, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "entropy": 0.9070071652531624, + "epoch": 0.42686292548298066, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004038481041789055, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 408552512.0, + "reward": 0.4609375, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 4.474630986806005e-05, + "sampling/sampling_logp_difference/max": 10.014501571655273, + "sampling/sampling_logp_difference/mean": 0.02077356167137623, + "step": 464 + }, + { + "clip_ratio/high_max": 1.7171289982798044e-05, + "clip_ratio/high_mean": 4.292822495699511e-06, + "clip_ratio/low_mean": 3.225401701456576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654683996501262e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15864.0, + "completions/mean_length": 6472.9453125, + "completions/mean_terminated_length": 5985.51611328125, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.8807859197258949, + "epoch": 0.42778288868445263, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004457853268831968, + "learning_rate": 1e-05, + "loss": 0.0295, + "num_tokens": 409399257.0, + "reward": 0.421875, + "reward_std": 0.20517179369926453, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 0.0017577135004103184, + "sampling/sampling_logp_difference/max": 6.343741416931152, + 
"sampling/sampling_logp_difference/mean": 0.020475786179304123, + "step": 465 + }, + { + "clip_ratio/high_max": 5.442162637336878e-05, + "clip_ratio/high_mean": 1.584139977239829e-05, + "clip_ratio/low_mean": 5.706528349946893e-05, + "clip_ratio/low_min": 2.5156462925224332e-05, + "clip_ratio/region_mean": 7.290668463610928e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15896.0, + "completions/mean_length": 5989.78125, + "completions/mean_terminated_length": 5654.48388671875, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.8479711338877678, + "epoch": 0.42870285188592455, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033953245729207993, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 410185645.0, + "reward": 0.5, + "reward_std": 0.3735082745552063, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999676942825317, + "sampling/importance_sampling_ratio/min": 1.781588616722729e-05, + "sampling/sampling_logp_difference/max": 10.935420036315918, + "sampling/sampling_logp_difference/mean": 0.017986344173550606, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.2673244681500364e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.2673244681500364e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16103.0, + "completions/mean_length": 8299.9453125, + "completions/mean_terminated_length": 8171.62744140625, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "entropy": 0.9363152608275414, + "epoch": 0.4296228150873965, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002381247701123357, + "learning_rate": 1e-05, + "loss": 
0.0651, + "num_tokens": 411268974.0, + "reward": 0.2890625, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 0.000553094083443284, + "sampling/sampling_logp_difference/max": 7.4999823570251465, + "sampling/sampling_logp_difference/mean": 0.021354343742132187, + "step": 467 + }, + { + "clip_ratio/high_max": 8.578695997130126e-06, + "clip_ratio/high_mean": 2.1446739992825314e-06, + "clip_ratio/low_mean": 2.84454882830687e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.059016239603807e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14838.0, + "completions/mean_length": 7434.0546875, + "completions/mean_terminated_length": 7219.25634765625, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "entropy": 0.981913685798645, + "epoch": 0.43054277828886844, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.006341467145830393, + "learning_rate": 1e-05, + "loss": -0.003, + "num_tokens": 412238117.0, + "reward": 0.390625, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 0.0019304680172353983, + "sampling/sampling_logp_difference/max": 6.249992847442627, + "sampling/sampling_logp_difference/mean": 0.02139873616397381, + "step": 468 + }, + { + "clip_ratio/high_max": 1.7187987396027893e-05, + "clip_ratio/high_mean": 5.150076049176278e-06, + "clip_ratio/low_mean": 5.4699471832009294e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.9849548279089504e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15871.0, + "completions/mean_length": 7211.1796875, + "completions/mean_terminated_length": 7138.95263671875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "entropy": 0.9307222217321396, + "epoch": 0.43146274149034036, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002621602965518832, + "learning_rate": 1e-05, + "loss": 0.0562, + "num_tokens": 413182860.0, + "reward": 0.3203125, + "reward_std": 0.34716784954071045, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999529123306274, + "sampling/importance_sampling_ratio/min": 5.1446182624204084e-05, + "sampling/sampling_logp_difference/max": 9.874974250793457, + "sampling/sampling_logp_difference/mean": 0.020250719040632248, + "step": 469 + }, + { + "clip_ratio/high_max": 1.0867412584047997e-05, + "clip_ratio/high_mean": 3.9217885614561965e-06, + "clip_ratio/low_mean": 4.7740833792886406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.16626223543426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15726.0, + "completions/mean_length": 5349.4296875, + "completions/mean_terminated_length": 5174.2783203125, + "completions/min_length": 983.0, + "completions/min_terminated_length": 983.0, + "entropy": 1.0213474333286285, + "epoch": 0.43238270469181234, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035241330042481422, + "learning_rate": 1e-05, + "loss": 0.0657, + "num_tokens": 413885963.0, + "reward": 0.3046875, + "reward_std": 0.25330984592437744, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.0003569081309251487, + "sampling/sampling_logp_difference/max": 7.938032150268555, + "sampling/sampling_logp_difference/mean": 0.01975759118795395, + "step": 470 + }, + { + "clip_ratio/high_max": 1.469514609198086e-05, + "clip_ratio/high_mean": 3.673786522995215e-06, + "clip_ratio/low_mean": 2.699725871480041e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0671045237795624e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15357.0, + "completions/mean_length": 7542.8515625, + "completions/mean_terminated_length": 7257.65283203125, + "completions/min_length": 1359.0, + "completions/min_terminated_length": 1359.0, + "entropy": 0.8882969543337822, + "epoch": 0.43330266789328425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014164346503093839, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 414870560.0, + "reward": 0.3671875, + "reward_std": 0.20753081142902374, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000402927398682, + "sampling/importance_sampling_ratio/min": 6.435441900976002e-05, + "sampling/sampling_logp_difference/max": 9.651104927062988, + "sampling/sampling_logp_difference/mean": 0.020874422043561935, + "step": 471 + }, + { + "clip_ratio/high_max": 1.669827497607912e-05, + "clip_ratio/high_mean": 4.17456874401978e-06, + "clip_ratio/low_mean": 3.673103901746799e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.090560787517461e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7286.90625, + "completions/mean_terminated_length": 6993.451171875, + "completions/min_length": 977.0, + 
"completions/min_terminated_length": 977.0, + "entropy": 0.9254636988043785, + "epoch": 0.43422263109475623, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026956009678542614, + "learning_rate": 1e-05, + "loss": 0.0567, + "num_tokens": 415825252.0, + "reward": 0.328125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999917209148407, + "sampling/importance_sampling_ratio/min": 0.0019701423589140177, + "sampling/sampling_logp_difference/max": 6.229649543762207, + "sampling/sampling_logp_difference/mean": 0.0202642735093832, + "step": 472 + }, + { + "clip_ratio/high_max": 9.162045444099931e-06, + "clip_ratio/high_mean": 2.2905113610249828e-06, + "clip_ratio/low_mean": 3.818475033767754e-05, + "clip_ratio/low_min": 7.20606476534158e-06, + "clip_ratio/region_mean": 4.047526181238936e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15908.0, + "completions/mean_length": 7244.7421875, + "completions/mean_terminated_length": 6716.0244140625, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.7817923128604889, + "epoch": 0.43514259429622815, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022128887940198183, + "learning_rate": 1e-05, + "loss": 0.0577, + "num_tokens": 416774011.0, + "reward": 0.453125, + "reward_std": 0.2937847375869751, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.0015034435782581568, + "sampling/sampling_logp_difference/max": 6.499997138977051, + "sampling/sampling_logp_difference/mean": 0.01840684749186039, + "step": 473 + }, + { + 
"clip_ratio/high_max": 1.2232871313244686e-05, + "clip_ratio/high_mean": 3.0582178283111716e-06, + "clip_ratio/low_mean": 3.636896872194484e-05, + "clip_ratio/low_min": 3.1460788250115e-06, + "clip_ratio/region_mean": 3.9427186266038916e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16254.0, + "completions/mean_length": 9042.90625, + "completions/mean_terminated_length": 8283.482421875, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "entropy": 0.9306210279464722, + "epoch": 0.43606255749770007, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0034676652867347, + "learning_rate": 1e-05, + "loss": 0.0504, + "num_tokens": 417951311.0, + "reward": 0.265625, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999234080314636, + "sampling/importance_sampling_ratio/min": 0.0002641192404553294, + "sampling/sampling_logp_difference/max": 8.239109992980957, + "sampling/sampling_logp_difference/mean": 0.02112819254398346, + "step": 474 + }, + { + "clip_ratio/high_max": 2.5187824576278217e-05, + "clip_ratio/high_mean": 8.202394610634656e-06, + "clip_ratio/low_mean": 4.3606626604741905e-05, + "clip_ratio/low_min": 3.5752079838857753e-06, + "clip_ratio/region_mean": 5.1809020988002885e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15721.0, + "completions/mean_length": 6763.6328125, + "completions/mean_terminated_length": 6610.9287109375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9879302233457565, + "epoch": 0.43698252069917204, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030218157917261124, + "learning_rate": 1e-05, + "loss": 0.0704, + "num_tokens": 
418836184.0, + "reward": 0.484375, + "reward_std": 0.30091896653175354, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999898672103882, + "sampling/importance_sampling_ratio/min": 0.0003778560785576701, + "sampling/sampling_logp_difference/max": 7.880997180938721, + "sampling/sampling_logp_difference/mean": 0.021101050078868866, + "step": 475 + }, + { + "clip_ratio/high_max": 1.0644185749697499e-05, + "clip_ratio/high_mean": 2.6610464374243747e-06, + "clip_ratio/low_mean": 6.21261324340594e-05, + "clip_ratio/low_min": 3.6509140954876784e-06, + "clip_ratio/region_mean": 6.478717887148377e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15675.0, + "completions/mean_length": 6794.25, + "completions/mean_terminated_length": 6564.09619140625, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 1.0259138569235802, + "epoch": 0.43790248390064396, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002881827764213085, + "learning_rate": 1e-05, + "loss": 0.0592, + "num_tokens": 419726192.0, + "reward": 0.265625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999275207519531, + "sampling/importance_sampling_ratio/min": 9.217044407705544e-07, + "sampling/sampling_logp_difference/max": 13.897041320800781, + "sampling/sampling_logp_difference/mean": 0.0210823193192482, + "step": 476 + }, + { + "clip_ratio/high_max": 1.108860487875063e-05, + "clip_ratio/high_mean": 2.7721512196876574e-06, + "clip_ratio/low_mean": 4.70996876629215e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9871839337356505e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14281.0, + "completions/max_terminated_length": 14281.0, + "completions/mean_length": 5648.2109375, + "completions/mean_terminated_length": 5648.2109375, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.88894472271204, + "epoch": 0.43882244710211593, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00289533962495625, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 420468867.0, + "reward": 0.484375, + "reward_std": 0.2675113081932068, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998449087142944, + "sampling/importance_sampling_ratio/min": 0.001372925122268498, + "sampling/sampling_logp_difference/max": 6.590811729431152, + "sampling/sampling_logp_difference/mean": 0.018499158322811127, + "step": 477 + }, + { + "clip_ratio/high_max": 4.753574557980755e-06, + "clip_ratio/high_mean": 1.1883936394951888e-06, + "clip_ratio/low_mean": 2.4103785335682915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5292179316238617e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15657.0, + "completions/mean_length": 6188.359375, + "completions/mean_terminated_length": 6026.52392578125, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "entropy": 0.8476063013076782, + "epoch": 0.43974241030358785, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.002749695209786296, + "learning_rate": 1e-05, + "loss": 0.0012, + "num_tokens": 421280881.0, + "reward": 0.3671875, + "reward_std": 0.15991678833961487, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999796152114868, + "sampling/importance_sampling_ratio/min": 0.004578418098390102, + "sampling/sampling_logp_difference/max": 5.386401653289795, + "sampling/sampling_logp_difference/mean": 0.018456483259797096, + "step": 478 + }, + { + "clip_ratio/high_max": 4.1359915030625416e-05, + "clip_ratio/high_mean": 1.0339978757656354e-05, + "clip_ratio/low_mean": 4.786080125995795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8200780586048495e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15112.0, + "completions/mean_length": 6864.3515625, + "completions/mean_terminated_length": 6635.88037109375, + "completions/min_length": 1065.0, + "completions/min_terminated_length": 1065.0, + "entropy": 0.8666203916072845, + "epoch": 0.4406623735050598, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.005116373300552368, + "learning_rate": 1e-05, + "loss": 0.0347, + "num_tokens": 422177822.0, + "reward": 0.4453125, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 0.00020385721290949732, + "sampling/sampling_logp_difference/max": 8.498090744018555, + "sampling/sampling_logp_difference/mean": 0.01979806460440159, + "step": 479 + }, + { + "clip_ratio/high_max": 1.4544774558089557e-05, + "clip_ratio/high_mean": 3.6361936395223893e-06, + "clip_ratio/low_mean": 4.153812756158004e-05, + "clip_ratio/low_min": 3.606462769312202e-06, + "clip_ratio/region_mean": 4.51743208031985e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15841.0, + "completions/mean_length": 7023.828125, + "completions/mean_terminated_length": 6799.18408203125, + "completions/min_length": 780.0, + 
"completions/min_terminated_length": 780.0, + "entropy": 0.9098334684967995, + "epoch": 0.44158233670653174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0020944855641573668, + "learning_rate": 1e-05, + "loss": 0.019, + "num_tokens": 423096576.0, + "reward": 0.2734375, + "reward_std": 0.20858672261238098, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999480247497559, + "sampling/importance_sampling_ratio/min": 0.0027383591514080763, + "sampling/sampling_logp_difference/max": 5.900396347045898, + "sampling/sampling_logp_difference/mean": 0.020111342892050743, + "step": 480 + }, + { + "clip_ratio/high_max": 3.256236095694476e-05, + "clip_ratio/high_mean": 1.2372795026749372e-05, + "clip_ratio/low_mean": 5.0774355258909054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.314715119515313e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15527.0, + "completions/mean_length": 6666.828125, + "completions/mean_terminated_length": 6512.587890625, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "entropy": 0.9162466824054718, + "epoch": 0.44250229990800366, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003897767048329115, + "learning_rate": 1e-05, + "loss": 0.1151, + "num_tokens": 423968050.0, + "reward": 0.46875, + "reward_std": 0.3527044653892517, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0031828521750867367, + "sampling/sampling_logp_difference/max": 5.7499775886535645, + "sampling/sampling_logp_difference/mean": 0.019923247396945953, + "step": 481 + }, + { + "clip_ratio/high_max": 
1.5341902098953142e-05, + "clip_ratio/high_mean": 4.791600815678976e-06, + "clip_ratio/low_mean": 7.980174223121139e-05, + "clip_ratio/low_min": 2.6713308216130827e-05, + "clip_ratio/region_mean": 8.459334412691533e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16223.0, + "completions/mean_length": 7159.8046875, + "completions/mean_terminated_length": 7013.38916015625, + "completions/min_length": 1022.0, + "completions/min_terminated_length": 1022.0, + "entropy": 0.8444746807217598, + "epoch": 0.44342226310947563, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003038195427507162, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 424902953.0, + "reward": 0.359375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999940037727356, + "sampling/importance_sampling_ratio/min": 7.431909580191132e-06, + "sampling/sampling_logp_difference/max": 11.809727668762207, + "sampling/sampling_logp_difference/mean": 0.019014043733477592, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.55851120666739e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.55851120666739e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14716.0, + "completions/mean_length": 6146.2109375, + "completions/mean_terminated_length": 6065.5986328125, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.8365580290555954, + "epoch": 0.44434222631094755, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025550283025950193, + "learning_rate": 1e-05, + "loss": 0.0548, + "num_tokens": 425709212.0, + "reward": 0.5625, + "reward_std": 0.2688094973564148, + 
"rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000015497207642, + "sampling/importance_sampling_ratio/min": 0.0006884043687023222, + "sampling/sampling_logp_difference/max": 7.281134128570557, + "sampling/sampling_logp_difference/mean": 0.019193854182958603, + "step": 483 + }, + { + "clip_ratio/high_max": 2.4752349872869672e-05, + "clip_ratio/high_mean": 7.036488455014478e-06, + "clip_ratio/low_mean": 4.780410063176532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.484058920046664e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16153.0, + "completions/mean_length": 6557.578125, + "completions/mean_terminated_length": 6321.744140625, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.8316832035779953, + "epoch": 0.4452621895124195, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005126865580677986, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 426566462.0, + "reward": 0.484375, + "reward_std": 0.27852246165275574, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 2.7536634661373682e-05, + "sampling/sampling_logp_difference/max": 10.499993324279785, + "sampling/sampling_logp_difference/mean": 0.01839536987245083, + "step": 484 + }, + { + "clip_ratio/high_max": 3.443571449679439e-05, + "clip_ratio/high_mean": 8.608928624198597e-06, + "clip_ratio/low_mean": 5.915772453590762e-05, + "clip_ratio/low_min": 1.7084812043322017e-05, + "clip_ratio/region_mean": 6.776665304641938e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16359.0, + "completions/mean_length": 7007.3203125, + "completions/mean_terminated_length": 6858.484375, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.8674142584204674, + "epoch": 0.44618215271389144, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004829525947570801, + "learning_rate": 1e-05, + "loss": 0.0753, + "num_tokens": 427480007.0, + "reward": 0.46875, + "reward_std": 0.3874102830886841, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998922944068909, + "sampling/importance_sampling_ratio/min": 0.00020170137577224523, + "sampling/sampling_logp_difference/max": 8.508722305297852, + "sampling/sampling_logp_difference/mean": 0.019586069509387016, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.539863354897534e-05, + "clip_ratio/low_min": 8.211341992137022e-06, + "clip_ratio/region_mean": 5.539863354897534e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14748.0, + "completions/mean_length": 7069.8828125, + "completions/mean_terminated_length": 6922.0400390625, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.9066255167126656, + "epoch": 0.44710211591536336, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003539952216669917, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 428404968.0, + "reward": 0.5, + "reward_std": 0.3618982434272766, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 0.00024052867956925184, + 
"sampling/sampling_logp_difference/max": 8.332671165466309, + "sampling/sampling_logp_difference/mean": 0.020427238196134567, + "step": 486 + }, + { + "clip_ratio/high_max": 1.6550495729461545e-05, + "clip_ratio/high_mean": 4.137623932365386e-06, + "clip_ratio/low_mean": 5.576918465521885e-05, + "clip_ratio/low_min": 1.2613936178240692e-05, + "clip_ratio/region_mean": 5.99068093833921e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15290.0, + "completions/max_terminated_length": 15290.0, + "completions/mean_length": 5586.6875, + "completions/mean_terminated_length": 5586.6875, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.9208655655384064, + "epoch": 0.44802207911683534, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0030504625756293535, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 429137176.0, + "reward": 0.515625, + "reward_std": 0.3480040729045868, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999984502792358, + "sampling/importance_sampling_ratio/min": 0.0005498559912666678, + "sampling/sampling_logp_difference/max": 7.50585412979126, + "sampling/sampling_logp_difference/mean": 0.019396595656871796, + "step": 487 + }, + { + "clip_ratio/high_max": 3.3761509712348925e-05, + "clip_ratio/high_mean": 8.440377428087231e-06, + "clip_ratio/low_mean": 3.6384140912559815e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.482451868170756e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15404.0, + "completions/mean_length": 5266.265625, + "completions/mean_terminated_length": 4999.4404296875, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.7884859293699265, + "epoch": 0.44894204231830726, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.003902251599356532, + "learning_rate": 1e-05, + "loss": -0.0077, + "num_tokens": 429836026.0, + "reward": 0.5390625, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.05675617232918739, + "sampling/sampling_logp_difference/max": 2.868990898132324, + "sampling/sampling_logp_difference/mean": 0.01770034246146679, + "step": 488 + }, + { + "clip_ratio/high_max": 2.2323702978610527e-05, + "clip_ratio/high_mean": 5.580925744652632e-06, + "clip_ratio/low_mean": 4.0199149452746497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.578007497002545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 6398.53125, + "completions/mean_terminated_length": 6319.9052734375, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "entropy": 0.8982341960072517, + "epoch": 0.44986200551977923, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0024998660665005445, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 430673446.0, + "reward": 0.421875, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797940254211, + "sampling/importance_sampling_ratio/min": 0.000612784584518522, + "sampling/sampling_logp_difference/max": 7.397497177124023, + "sampling/sampling_logp_difference/mean": 0.020521972328424454, + "step": 489 + }, + { + "clip_ratio/high_max": 3.1756624366607866e-05, + "clip_ratio/high_mean": 7.939156091651967e-06, + "clip_ratio/low_mean": 
8.124458963720826e-05, + "clip_ratio/low_min": 1.2379174222587608e-05, + "clip_ratio/region_mean": 8.91837471499457e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14374.0, + "completions/mean_length": 6277.65625, + "completions/mean_terminated_length": 6198.07861328125, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.8139145970344543, + "epoch": 0.45078196872125115, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.00784115307033062, + "learning_rate": 1e-05, + "loss": 0.0798, + "num_tokens": 431497546.0, + "reward": 0.546875, + "reward_std": 0.37716054916381836, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999848484992981, + "sampling/importance_sampling_ratio/min": 0.0006267798598855734, + "sampling/sampling_logp_difference/max": 7.37491512298584, + "sampling/sampling_logp_difference/mean": 0.01836184598505497, + "step": 490 + }, + { + "clip_ratio/high_max": 8.875004823494237e-06, + "clip_ratio/high_mean": 2.2187512058735592e-06, + "clip_ratio/low_mean": 2.3825880248296016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6044631454169576e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15903.0, + "completions/mean_length": 7708.59375, + "completions/mean_terminated_length": 7355.9345703125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 1.087083138525486, + "epoch": 0.45170193192272307, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.004277343396097422, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 432503414.0, + "reward": 0.2890625, + "reward_std": 0.1633366346359253, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 
0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999503493309021, + "sampling/importance_sampling_ratio/min": 1.2187546417408157e-05, + "sampling/sampling_logp_difference/max": 11.315095901489258, + "sampling/sampling_logp_difference/mean": 0.02224145457148552, + "step": 491 + }, + { + "clip_ratio/high_max": 6.384065272868611e-06, + "clip_ratio/high_mean": 1.5960163182171527e-06, + "clip_ratio/low_mean": 3.561227788395627e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.720829374742607e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7162.7109375, + "completions/mean_terminated_length": 6865.25, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.9157010763883591, + "epoch": 0.45262189512419504, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.006278311368077993, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 433439137.0, + "reward": 0.5078125, + "reward_std": 0.2227931171655655, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966561794281, + "sampling/importance_sampling_ratio/min": 0.0005532125360332429, + "sampling/sampling_logp_difference/max": 7.499768257141113, + "sampling/sampling_logp_difference/mean": 0.02123419940471649, + "step": 492 + }, + { + "clip_ratio/high_max": 2.846911434062349e-05, + "clip_ratio/high_mean": 8.656040449750435e-06, + "clip_ratio/low_mean": 5.1716241614485625e-05, + "clip_ratio/low_min": 3.601579010137357e-06, + "clip_ratio/region_mean": 6.037228104105452e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16123.0, + "completions/mean_length": 7388.90625, + 
"completions/mean_terminated_length": 7023.251953125, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "entropy": 0.7670486867427826, + "epoch": 0.45354185832566696, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005177734419703484, + "learning_rate": 1e-05, + "loss": 0.0556, + "num_tokens": 434402045.0, + "reward": 0.3828125, + "reward_std": 0.37951958179473877, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999250769615173, + "sampling/importance_sampling_ratio/min": 0.0022511729039251804, + "sampling/sampling_logp_difference/max": 6.096303939819336, + "sampling/sampling_logp_difference/mean": 0.01827731542289257, + "step": 493 + }, + { + "clip_ratio/high_max": 2.1548471977439476e-05, + "clip_ratio/high_mean": 6.257203722270788e-06, + "clip_ratio/low_mean": 7.719641234871233e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.345361538886209e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15767.0, + "completions/mean_length": 6805.375, + "completions/mean_terminated_length": 6496.38671875, + "completions/min_length": 587.0, + "completions/min_terminated_length": 587.0, + "entropy": 0.8407405763864517, + "epoch": 0.45446182152713893, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032320048194378614, + "learning_rate": 1e-05, + "loss": 0.0662, + "num_tokens": 435292029.0, + "reward": 0.4296875, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999642372131348, + "sampling/importance_sampling_ratio/min": 6.679954094579443e-05, + "sampling/sampling_logp_difference/max": 9.613814353942871, + 
"sampling/sampling_logp_difference/mean": 0.018761277198791504, + "step": 494 + }, + { + "clip_ratio/high_max": 3.460495008766884e-06, + "clip_ratio/high_mean": 8.65123752191721e-07, + "clip_ratio/low_mean": 7.76378024056612e-05, + "clip_ratio/low_min": 1.7026316072588088e-05, + "clip_ratio/region_mean": 7.850292649891344e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15105.0, + "completions/mean_length": 5753.4140625, + "completions/mean_terminated_length": 5321.2763671875, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "entropy": 0.7848984077572823, + "epoch": 0.45538178472861085, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030854379292577505, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 436046842.0, + "reward": 0.578125, + "reward_std": 0.31405961513519287, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998626708984375, + "sampling/importance_sampling_ratio/min": 4.36544311810394e-09, + "sampling/sampling_logp_difference/max": 19.24954605102539, + "sampling/sampling_logp_difference/mean": 0.017733070999383926, + "step": 495 + }, + { + "clip_ratio/high_max": 1.7207588371093152e-05, + "clip_ratio/high_mean": 4.301897092773288e-06, + "clip_ratio/low_mean": 3.234025916754035e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.664215591925313e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15682.0, + "completions/mean_length": 6522.84375, + "completions/mean_terminated_length": 6445.19677734375, + "completions/min_length": 1062.0, + "completions/min_terminated_length": 1062.0, + "entropy": 1.0593653172254562, + "epoch": 0.4563017479300828, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.003124243812635541, + "learning_rate": 1e-05, + "loss": 0.0805, + "num_tokens": 436899638.0, + "reward": 0.4140625, + "reward_std": 0.2706219553947449, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999418258666992, + "sampling/importance_sampling_ratio/min": 4.476920821616659e-06, + "sampling/sampling_logp_difference/max": 12.316575050354004, + "sampling/sampling_logp_difference/mean": 0.021180003881454468, + "step": 496 + }, + { + "clip_ratio/high_max": 1.1790433973146719e-05, + "clip_ratio/high_mean": 2.9476084932866797e-06, + "clip_ratio/low_mean": 2.8437304308681632e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.138491274512489e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14515.0, + "completions/mean_length": 6203.203125, + "completions/mean_terminated_length": 5874.7900390625, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.8152795508503914, + "epoch": 0.45722171113155474, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005001795012503862, + "learning_rate": 1e-05, + "loss": 0.0817, + "num_tokens": 437713008.0, + "reward": 0.4296875, + "reward_std": 0.26143795251846313, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101758003235, + "sampling/importance_sampling_ratio/min": 0.001757707679644227, + "sampling/sampling_logp_difference/max": 6.34374475479126, + "sampling/sampling_logp_difference/mean": 0.017751028761267662, + "step": 497 + }, + { + "clip_ratio/high_max": 1.3163793028070359e-05, + "clip_ratio/high_mean": 4.229499381835922e-06, + "clip_ratio/low_mean": 4.4599403963729856e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.882890357293945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15423.0, + "completions/mean_length": 5975.5234375, + "completions/mean_terminated_length": 5725.72021484375, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "entropy": 0.8275932744145393, + "epoch": 0.45814167433302666, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005084732081741095, + "learning_rate": 1e-05, + "loss": 0.0759, + "num_tokens": 438495811.0, + "reward": 0.5390625, + "reward_std": 0.28513264656066895, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998699426651001, + "sampling/importance_sampling_ratio/min": 3.120788460364565e-05, + "sampling/sampling_logp_difference/max": 10.374839782714844, + "sampling/sampling_logp_difference/mean": 0.018671832978725433, + "step": 498 + }, + { + "clip_ratio/high_max": 3.229640242352616e-06, + "clip_ratio/high_mean": 8.07410060588154e-07, + "clip_ratio/low_mean": 3.0413870263146237e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1221280551108066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16110.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 7019.59375, + "completions/mean_terminated_length": 7019.59375, + "completions/min_length": 1058.0, + "completions/min_terminated_length": 1058.0, + "entropy": 0.9266618490219116, + "epoch": 0.45906163753449863, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002567912917584181, + "learning_rate": 1e-05, + "loss": 0.0282, + "num_tokens": 439413055.0, + "reward": 0.375, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000476837158203, + "sampling/importance_sampling_ratio/min": 0.0010315657127648592, + "sampling/sampling_logp_difference/max": 6.876677513122559, + "sampling/sampling_logp_difference/mean": 0.02012534812092781, + "step": 499 + }, + { + "clip_ratio/high_max": 1.8327779343962902e-05, + "clip_ratio/high_mean": 4.5819448359907256e-06, + "clip_ratio/low_mean": 4.08189575864526e-05, + "clip_ratio/low_min": 4.041122338094283e-06, + "clip_ratio/region_mean": 4.5400901854009135e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7373.3203125, + "completions/mean_terminated_length": 7082.65283203125, + "completions/min_length": 854.0, + "completions/min_terminated_length": 854.0, + "entropy": 0.9383682310581207, + "epoch": 0.45998160073597055, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004862098954617977, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 440375128.0, + "reward": 0.4375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.0006883886526338756, + "sampling/sampling_logp_difference/max": 7.28115701675415, + "sampling/sampling_logp_difference/mean": 0.020596595481038094, + "step": 500 + }, + { + "clip_ratio/high_max": 1.650619151405408e-05, + "clip_ratio/high_mean": 4.12654787851352e-06, + "clip_ratio/low_mean": 6.364750265674957e-05, + "clip_ratio/low_min": 3.94595599573222e-06, + "clip_ratio/region_mean": 6.77740499668289e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16280.0, + "completions/mean_length": 5944.953125, + "completions/mean_terminated_length": 5862.755859375, + 
"completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "entropy": 0.9130716845393181, + "epoch": 0.4609015639374425, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041388699784875, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 441156306.0, + "reward": 0.3984375, + "reward_std": 0.3345639705657959, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999566078186035, + "sampling/importance_sampling_ratio/min": 0.0007685241289436817, + "sampling/sampling_logp_difference/max": 7.171038627624512, + "sampling/sampling_logp_difference/mean": 0.019817989319562912, + "step": 501 + }, + { + "clip_ratio/high_max": 2.9951792839710834e-05, + "clip_ratio/high_mean": 9.205811807078135e-06, + "clip_ratio/low_mean": 3.147234815514821e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0678160075913183e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16181.0, + "completions/mean_length": 6686.015625, + "completions/mean_terminated_length": 6609.6533203125, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 0.8640913739800453, + "epoch": 0.46182152713891444, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005679543130099773, + "learning_rate": 1e-05, + "loss": 0.0306, + "num_tokens": 442032972.0, + "reward": 0.5546875, + "reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 0.007731473073363304, + "sampling/sampling_logp_difference/max": 4.86245584487915, + "sampling/sampling_logp_difference/mean": 0.019738182425498962, + "step": 502 + 
}, + { + "clip_ratio/high_max": 3.0190597726686974e-05, + "clip_ratio/high_mean": 7.5476494316717435e-06, + "clip_ratio/low_mean": 3.858067566397949e-05, + "clip_ratio/low_min": 9.290916750614997e-06, + "clip_ratio/region_mean": 4.612832617567619e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15951.0, + "completions/mean_length": 6945.5, + "completions/mean_terminated_length": 6231.6640625, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "entropy": 0.8156519457697868, + "epoch": 0.46274149034038636, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006176612339913845, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 442940940.0, + "reward": 0.46875, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999117851257324, + "sampling/importance_sampling_ratio/min": 0.00018278000061400235, + "sampling/sampling_logp_difference/max": 8.607227325439453, + "sampling/sampling_logp_difference/mean": 0.01836501806974411, + "step": 503 + }, + { + "clip_ratio/high_max": 2.2105000425653998e-05, + "clip_ratio/high_mean": 6.28071654773521e-06, + "clip_ratio/low_mean": 3.060894187001395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6889658531436e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15847.0, + "completions/mean_length": 8068.5390625, + "completions/mean_terminated_length": 7363.8388671875, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "entropy": 0.8196670189499855, + "epoch": 0.46366145354185834, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021770994644612074, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 443992041.0, + "reward": 
0.4453125, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999759197235107, + "sampling/importance_sampling_ratio/min": 0.0001795605494407937, + "sampling/sampling_logp_difference/max": 8.624998092651367, + "sampling/sampling_logp_difference/mean": 0.019003838300704956, + "step": 504 + }, + { + "clip_ratio/high_max": 1.287241002501105e-05, + "clip_ratio/high_mean": 3.2181025062527624e-06, + "clip_ratio/low_mean": 4.5685408849749365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.89035115833758e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15168.0, + "completions/mean_length": 5209.140625, + "completions/mean_terminated_length": 5031.76220703125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.8851845487952232, + "epoch": 0.46458141674333026, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00788798462599516, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 444679675.0, + "reward": 0.4609375, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796748161316, + "sampling/importance_sampling_ratio/min": 0.00025673024356365204, + "sampling/sampling_logp_difference/max": 8.267484664916992, + "sampling/sampling_logp_difference/mean": 0.018808994442224503, + "step": 505 + }, + { + "clip_ratio/high_max": 2.294301202709903e-05, + "clip_ratio/high_mean": 6.590465602585027e-06, + "clip_ratio/low_mean": 5.944662643742049e-05, + "clip_ratio/low_min": 8.106994755507912e-06, + "clip_ratio/region_mean": 6.603709243790945e-05, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16259.0, + "completions/mean_length": 7558.8984375, + "completions/mean_terminated_length": 7274.21728515625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.003449946641922, + "epoch": 0.46550137994480223, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004547314252704382, + "learning_rate": 1e-05, + "loss": 0.1586, + "num_tokens": 445668126.0, + "reward": 0.421875, + "reward_std": 0.42293959856033325, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999848484992981, + "sampling/importance_sampling_ratio/min": 0.00011622780584730208, + "sampling/sampling_logp_difference/max": 9.059958457946777, + "sampling/sampling_logp_difference/mean": 0.02099413052201271, + "step": 506 + }, + { + "clip_ratio/high_max": 2.1350435872591333e-05, + "clip_ratio/high_mean": 6.047981628398702e-06, + "clip_ratio/low_mean": 8.880347786544007e-05, + "clip_ratio/low_min": 9.06585455595632e-06, + "clip_ratio/region_mean": 9.485145938015194e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16137.0, + "completions/max_terminated_length": 16137.0, + "completions/mean_length": 6066.6015625, + "completions/mean_terminated_length": 6066.6015625, + "completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "entropy": 0.8450648710131645, + "epoch": 0.46642134314627415, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004621773958206177, + "learning_rate": 1e-05, + "loss": 0.121, + "num_tokens": 446464587.0, + "reward": 0.5390625, + "reward_std": 0.3537652790546417, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000154972076416, + 
"sampling/importance_sampling_ratio/min": 1.3950601896794979e-05, + "sampling/sampling_logp_difference/max": 11.179987907409668, + "sampling/sampling_logp_difference/mean": 0.018016980960965157, + "step": 507 + }, + { + "clip_ratio/high_max": 3.0534724828612525e-06, + "clip_ratio/high_mean": 7.633681207153131e-07, + "clip_ratio/low_mean": 2.149350007130124e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2256868305703392e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 6988.0234375, + "completions/mean_terminated_length": 6838.88134765625, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 1.0452716201543808, + "epoch": 0.46734130634774607, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004523546434938908, + "learning_rate": 1e-05, + "loss": 0.0396, + "num_tokens": 447381134.0, + "reward": 0.3515625, + "reward_std": 0.22567617893218994, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999901056289673, + "sampling/importance_sampling_ratio/min": 0.016167031601071358, + "sampling/sampling_logp_difference/max": 4.124781131744385, + "sampling/sampling_logp_difference/mean": 0.021812722086906433, + "step": 508 + }, + { + "clip_ratio/high_max": 5.58759120394825e-06, + "clip_ratio/high_mean": 1.3968978009870625e-06, + "clip_ratio/low_mean": 3.684896307731833e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.824586099199223e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12316.0, + "completions/max_terminated_length": 12316.0, + "completions/mean_length": 5948.5, + "completions/mean_terminated_length": 5948.5, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "entropy": 0.8241566568613052, + 
"epoch": 0.46826126954921804, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004002885892987251, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 448158014.0, + "reward": 0.5703125, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999228715896606, + "sampling/importance_sampling_ratio/min": 0.0008566387114115059, + "sampling/sampling_logp_difference/max": 7.062494277954102, + "sampling/sampling_logp_difference/mean": 0.018487900495529175, + "step": 509 + }, + { + "clip_ratio/high_max": 1.0490723752809572e-05, + "clip_ratio/high_mean": 3.439610338773491e-06, + "clip_ratio/low_mean": 3.973086239739132e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3170473020381905e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16044.0, + "completions/mean_length": 7966.375, + "completions/mean_terminated_length": 7764.3525390625, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "entropy": 0.8868448063731194, + "epoch": 0.46918123275068996, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019062751671299338, + "learning_rate": 1e-05, + "loss": 0.0787, + "num_tokens": 449197054.0, + "reward": 0.40625, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0001614262000657618, + "sampling/sampling_logp_difference/max": 8.731462478637695, + "sampling/sampling_logp_difference/mean": 0.020015282556414604, + "step": 510 + }, + { + "clip_ratio/high_max": 1.2195105682621943e-05, + "clip_ratio/high_mean": 3.0487764206554857e-06, + 
"clip_ratio/low_mean": 3.558348203114292e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8632259474979946e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 6520.0234375, + "completions/mean_terminated_length": 6442.3544921875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9168323278427124, + "epoch": 0.47010119595216193, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00490277074277401, + "learning_rate": 1e-05, + "loss": 0.0547, + "num_tokens": 450050153.0, + "reward": 0.484375, + "reward_std": 0.3437528908252716, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 4.4418397919798736e-06, + "sampling/sampling_logp_difference/max": 12.324441909790039, + "sampling/sampling_logp_difference/mean": 0.020178331062197685, + "step": 511 + }, + { + "clip_ratio/high_max": 7.95772848505294e-06, + "clip_ratio/high_mean": 1.989432121263235e-06, + "clip_ratio/low_mean": 3.363800146871654e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.562743381735345e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16252.0, + "completions/mean_length": 6614.5625, + "completions/mean_terminated_length": 6217.4306640625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.8635925352573395, + "epoch": 0.47102115915363385, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003792276605963707, + "learning_rate": 1e-05, + "loss": -0.0023, + "num_tokens": 450915281.0, + "reward": 0.5, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 
0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999154806137085, + "sampling/importance_sampling_ratio/min": 0.004489119164645672, + "sampling/sampling_logp_difference/max": 5.40609884262085, + "sampling/sampling_logp_difference/mean": 0.019233014434576035, + "step": 512 + }, + { + "clip_ratio/high_max": 1.6306271390931215e-05, + "clip_ratio/high_mean": 6.67555605105008e-06, + "clip_ratio/low_mean": 3.4846169796765025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1521726302562456e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16206.0, + "completions/mean_length": 6458.5078125, + "completions/mean_terminated_length": 5970.36865234375, + "completions/min_length": 1025.0, + "completions/min_terminated_length": 1025.0, + "entropy": 0.8816124573349953, + "epoch": 0.47194112235510577, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031763892620801926, + "learning_rate": 1e-05, + "loss": 0.0287, + "num_tokens": 451761322.0, + "reward": 0.4921875, + "reward_std": 0.282474160194397, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999036192893982, + "sampling/importance_sampling_ratio/min": 9.611394489184022e-05, + "sampling/sampling_logp_difference/max": 9.24997615814209, + "sampling/sampling_logp_difference/mean": 0.01935420371592045, + "step": 513 + }, + { + "clip_ratio/high_max": 7.861634912842419e-06, + "clip_ratio/high_mean": 3.0314158721012063e-06, + "clip_ratio/low_mean": 2.2518463538290234e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.554987941039144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + "completions/mean_length": 5844.03125, + 
"completions/mean_terminated_length": 5676.73046875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.9008020162582397, + "epoch": 0.47286108555657774, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004134794697165489, + "learning_rate": 1e-05, + "loss": 0.1094, + "num_tokens": 452526342.0, + "reward": 0.546875, + "reward_std": 0.28930899500846863, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999297857284546, + "sampling/importance_sampling_ratio/min": 0.00012955136480741203, + "sampling/sampling_logp_difference/max": 8.951433181762695, + "sampling/sampling_logp_difference/mean": 0.02013866975903511, + "step": 514 + }, + { + "clip_ratio/high_max": 1.2711160707112867e-05, + "clip_ratio/high_mean": 3.177790176778217e-06, + "clip_ratio/low_mean": 2.444096298859222e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.761875293799676e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 6214.5859375, + "completions/mean_terminated_length": 6134.51171875, + "completions/min_length": 1096.0, + "completions/min_terminated_length": 1096.0, + "entropy": 0.9522949978709221, + "epoch": 0.47378104875804966, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022520655766129494, + "learning_rate": 1e-05, + "loss": 0.0193, + "num_tokens": 453343385.0, + "reward": 0.4921875, + "reward_std": 0.20623260736465454, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999879598617554, + "sampling/importance_sampling_ratio/min": 3.763851054827683e-05, + "sampling/sampling_logp_difference/max": 10.187482833862305, + 
"sampling/sampling_logp_difference/mean": 0.019947605207562447, + "step": 515 + }, + { + "clip_ratio/high_max": 5.724247012039996e-05, + "clip_ratio/high_mean": 1.431061753009999e-05, + "clip_ratio/low_mean": 3.371703428456385e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.8027652155724354e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14376.0, + "completions/mean_length": 7138.515625, + "completions/mean_terminated_length": 7065.71630859375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.8856206461787224, + "epoch": 0.47470101195952163, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004887089133262634, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 454275379.0, + "reward": 0.4609375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999544620513916, + "sampling/importance_sampling_ratio/min": 0.004931141622364521, + "sampling/sampling_logp_difference/max": 5.312184810638428, + "sampling/sampling_logp_difference/mean": 0.019449077546596527, + "step": 516 + }, + { + "clip_ratio/high_max": 1.5607688055752078e-05, + "clip_ratio/high_mean": 3.9019220139380195e-06, + "clip_ratio/low_mean": 4.936055870530254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.326248106030107e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15855.0, + "completions/mean_length": 6077.796875, + "completions/mean_terminated_length": 5915.00830078125, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.862022191286087, + "epoch": 0.47562097516099355, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003875041613355279, + 
"learning_rate": 1e-05, + "loss": 0.0366, + "num_tokens": 455076625.0, + "reward": 0.4921875, + "reward_std": 0.23933593928813934, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000392198562622, + "sampling/importance_sampling_ratio/min": 3.322543852846138e-05, + "sampling/sampling_logp_difference/max": 10.31219482421875, + "sampling/sampling_logp_difference/mean": 0.018907926976680756, + "step": 517 + }, + { + "clip_ratio/high_max": 1.0557040241110371e-05, + "clip_ratio/high_mean": 3.535163386914064e-06, + "clip_ratio/low_mean": 3.7409978290270374e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0945141790871276e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 6211.65625, + "completions/mean_terminated_length": 6211.65625, + "completions/min_length": 1292.0, + "completions/min_terminated_length": 1292.0, + "entropy": 0.8835236355662346, + "epoch": 0.4765409383624655, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004288897849619389, + "learning_rate": 1e-05, + "loss": 0.0822, + "num_tokens": 455889693.0, + "reward": 0.53125, + "reward_std": 0.27145031094551086, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999270439147949, + "sampling/importance_sampling_ratio/min": 2.5614745027269237e-06, + "sampling/sampling_logp_difference/max": 12.874927520751953, + "sampling/sampling_logp_difference/mean": 0.01986120268702507, + "step": 518 + }, + { + "clip_ratio/high_max": 2.842265530489385e-06, + "clip_ratio/high_mean": 7.105663826223463e-07, + "clip_ratio/low_mean": 3.578249538804812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.649306199804414e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16128.0, + "completions/mean_length": 7035.609375, + "completions/mean_terminated_length": 6962.0, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "entropy": 0.9033957049250603, + "epoch": 0.47746090156393745, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.004230308346450329, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 456809643.0, + "reward": 0.3203125, + "reward_std": 0.17282497882843018, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999722242355347, + "sampling/importance_sampling_ratio/min": 1.670435995038133e-05, + "sampling/sampling_logp_difference/max": 10.99984073638916, + "sampling/sampling_logp_difference/mean": 0.020262110978364944, + "step": 519 + }, + { + "clip_ratio/high_max": 3.539844283295679e-05, + "clip_ratio/high_mean": 9.844010264714598e-06, + "clip_ratio/low_mean": 2.8534720058814855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.837873060774655e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16241.0, + "completions/mean_length": 6557.40625, + "completions/mean_terminated_length": 6321.568359375, + "completions/min_length": 1136.0, + "completions/min_terminated_length": 1136.0, + "entropy": 0.8352414071559906, + "epoch": 0.47838086476540936, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0029154124204069376, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 457669431.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 5.8480534789850935e-05, + "sampling/sampling_logp_difference/max": 9.746816635131836, + "sampling/sampling_logp_difference/mean": 0.019474683329463005, + "step": 520 + }, + { + "clip_ratio/high_max": 6.400114170901361e-05, + "clip_ratio/high_mean": 1.917558859076962e-05, + "clip_ratio/low_mean": 5.166920755073079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.084479466357152e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15428.0, + "completions/mean_length": 6444.1328125, + "completions/mean_terminated_length": 6205.576171875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "entropy": 0.7480100840330124, + "epoch": 0.47930082796688134, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025195449125021696, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 458512648.0, + "reward": 0.515625, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999996542930603, + "sampling/importance_sampling_ratio/min": 2.4302940801135264e-05, + "sampling/sampling_logp_difference/max": 10.624913215637207, + "sampling/sampling_logp_difference/mean": 0.01779567077755928, + "step": 521 + }, + { + "clip_ratio/high_max": 2.748944325503544e-06, + "clip_ratio/high_mean": 6.87236081375886e-07, + "clip_ratio/low_mean": 3.4855478702411347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5542715181691165e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15868.0, + "completions/mean_length": 6615.234375, + "completions/mean_terminated_length": 6380.7841796875, + "completions/min_length": 105.0, + 
"completions/min_terminated_length": 105.0, + "entropy": 0.8428665772080421, + "epoch": 0.48022079116835326, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004339073318988085, + "learning_rate": 1e-05, + "loss": 0.0608, + "num_tokens": 459377790.0, + "reward": 0.5234375, + "reward_std": 0.31064465641975403, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999370574951172, + "sampling/importance_sampling_ratio/min": 0.00042492515058256686, + "sampling/sampling_logp_difference/max": 7.76359748840332, + "sampling/sampling_logp_difference/mean": 0.018815383315086365, + "step": 522 + }, + { + "clip_ratio/high_max": 2.2513844896820956e-05, + "clip_ratio/high_mean": 7.496596083456097e-06, + "clip_ratio/low_mean": 2.2591082483813807e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0087678169365972e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15239.0, + "completions/mean_length": 6200.3203125, + "completions/mean_terminated_length": 5955.912109375, + "completions/min_length": 1032.0, + "completions/min_terminated_length": 1032.0, + "entropy": 0.9044734612107277, + "epoch": 0.48114075436982523, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005003004334867001, + "learning_rate": 1e-05, + "loss": 0.0502, + "num_tokens": 460189823.0, + "reward": 0.484375, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999645948410034, + "sampling/importance_sampling_ratio/min": 0.005019097588956356, + "sampling/sampling_logp_difference/max": 5.2945051193237305, + "sampling/sampling_logp_difference/mean": 0.0192951001226902, + "step": 523 + }, + { + "clip_ratio/high_max": 
1.9086801785306307e-05, + "clip_ratio/high_mean": 4.771700446326577e-06, + "clip_ratio/low_mean": 3.145246773783583e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.622416772941506e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15706.0, + "completions/max_terminated_length": 15706.0, + "completions/mean_length": 5758.9140625, + "completions/mean_terminated_length": 5758.9140625, + "completions/min_length": 1181.0, + "completions/min_terminated_length": 1181.0, + "entropy": 0.8783154934644699, + "epoch": 0.48206071757129715, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005491400603204966, + "learning_rate": 1e-05, + "loss": 0.0209, + "num_tokens": 460944164.0, + "reward": 0.5859375, + "reward_std": 0.2330428510904312, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.003907227888703346, + "sampling/sampling_logp_difference/max": 5.54492712020874, + "sampling/sampling_logp_difference/mean": 0.019315458834171295, + "step": 524 + }, + { + "clip_ratio/high_max": 1.5554858691757545e-05, + "clip_ratio/high_mean": 3.888714672939386e-06, + "clip_ratio/low_mean": 9.616303373150004e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.3505018273463065e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15536.0, + "completions/mean_length": 7573.375, + "completions/mean_terminated_length": 7504.0, + "completions/min_length": 1579.0, + "completions/min_terminated_length": 1579.0, + "entropy": 1.057753436267376, + "epoch": 0.48298068077276907, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0038622859865427017, + "learning_rate": 1e-05, + "loss": 0.0103, + "num_tokens": 461931916.0, + "reward": 0.3125, + "reward_std": 0.14123955368995667, + 
"rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.002133321948349476, + "sampling/sampling_logp_difference/max": 6.1500749588012695, + "sampling/sampling_logp_difference/mean": 0.02145528793334961, + "step": 525 + }, + { + "clip_ratio/high_max": 2.2185531634022482e-05, + "clip_ratio/high_mean": 6.324094329102081e-06, + "clip_ratio/low_mean": 4.7102344296945375e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342643908079481e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14553.0, + "completions/mean_length": 7353.0703125, + "completions/mean_terminated_length": 7136.328125, + "completions/min_length": 907.0, + "completions/min_terminated_length": 907.0, + "entropy": 0.9386680871248245, + "epoch": 0.48390064397424104, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002902502194046974, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 462894701.0, + "reward": 0.5234375, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999150037765503, + "sampling/importance_sampling_ratio/min": 0.00492977537214756, + "sampling/sampling_logp_difference/max": 5.312461853027344, + "sampling/sampling_logp_difference/mean": 0.021296534687280655, + "step": 526 + }, + { + "clip_ratio/high_max": 1.8664793969946913e-05, + "clip_ratio/high_mean": 4.666198492486728e-06, + "clip_ratio/low_mean": 5.111583186589996e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.578203035838669e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15851.0, + "completions/mean_length": 7280.953125, + "completions/mean_terminated_length": 6987.30615234375, + "completions/min_length": 1111.0, + "completions/min_terminated_length": 1111.0, + "entropy": 0.9424067437648773, + "epoch": 0.48482060717571296, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002602500608190894, + "learning_rate": 1e-05, + "loss": 0.0546, + "num_tokens": 463849087.0, + "reward": 0.3125, + "reward_std": 0.2290911078453064, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999302625656128, + "sampling/importance_sampling_ratio/min": 4.007156167062931e-05, + "sampling/sampling_logp_difference/max": 10.12484359741211, + "sampling/sampling_logp_difference/mean": 0.020630592480301857, + "step": 527 + }, + { + "clip_ratio/high_max": 3.77411461158772e-05, + "clip_ratio/high_mean": 1.0150766001970624e-05, + "clip_ratio/low_mean": 4.5688502041230095e-05, + "clip_ratio/low_min": 5.72383623875794e-06, + "clip_ratio/region_mean": 5.583926849794807e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14628.0, + "completions/max_terminated_length": 14628.0, + "completions/mean_length": 6520.6328125, + "completions/mean_terminated_length": 6520.6328125, + "completions/min_length": 1459.0, + "completions/min_terminated_length": 1459.0, + "entropy": 0.8501213267445564, + "epoch": 0.48574057037718493, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005743890535086393, + "learning_rate": 1e-05, + "loss": 0.1494, + "num_tokens": 464704336.0, + "reward": 0.3984375, + "reward_std": 0.3413938879966736, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999988079071045, + "sampling/importance_sampling_ratio/min": 5.838880315423012e-05, + 
"sampling/sampling_logp_difference/max": 9.74838638305664, + "sampling/sampling_logp_difference/mean": 0.018370801582932472, + "step": 528 + }, + { + "clip_ratio/high_max": 9.150254300038796e-06, + "clip_ratio/high_mean": 2.287563575009699e-06, + "clip_ratio/low_mean": 2.1804387529300584e-05, + "clip_ratio/low_min": 3.918126822100021e-06, + "clip_ratio/region_mean": 2.4091951559057634e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14675.0, + "completions/max_terminated_length": 14675.0, + "completions/mean_length": 7111.0, + "completions/mean_terminated_length": 7111.0, + "completions/min_length": 1288.0, + "completions/min_terminated_length": 1288.0, + "entropy": 0.8829544633626938, + "epoch": 0.48666053357865685, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004826955031603575, + "learning_rate": 1e-05, + "loss": 0.0967, + "num_tokens": 465632152.0, + "reward": 0.3984375, + "reward_std": 0.2975040376186371, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999524354934692, + "sampling/importance_sampling_ratio/min": 0.00011604782775975764, + "sampling/sampling_logp_difference/max": 9.061508178710938, + "sampling/sampling_logp_difference/mean": 0.019976403564214706, + "step": 529 + }, + { + "clip_ratio/high_max": 2.3185014015325578e-05, + "clip_ratio/high_mean": 7.603994390592561e-06, + "clip_ratio/low_mean": 4.392900382299558e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.153299889570917e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15132.0, + "completions/mean_length": 7797.7109375, + "completions/mean_terminated_length": 7448.67431640625, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.9747610911726952, + "epoch": 0.48758049678012877, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0028944616205990314, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 466648507.0, + "reward": 0.390625, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999991774559021, + "sampling/importance_sampling_ratio/min": 0.0002612585376482457, + "sampling/sampling_logp_difference/max": 8.25, + "sampling/sampling_logp_difference/mean": 0.020830729976296425, + "step": 530 + }, + { + "clip_ratio/high_max": 1.4947459476388758e-05, + "clip_ratio/high_mean": 3.7368648690971895e-06, + "clip_ratio/low_mean": 4.282657914700394e-05, + "clip_ratio/low_min": 4.545454430626705e-06, + "clip_ratio/region_mean": 4.656344435716164e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 6395.4765625, + "completions/mean_terminated_length": 6316.82666015625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.9015842452645302, + "epoch": 0.48850045998160074, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003612271510064602, + "learning_rate": 1e-05, + "loss": 0.0573, + "num_tokens": 467487976.0, + "reward": 0.4921875, + "reward_std": 0.2664504945278168, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998596906661987, + "sampling/importance_sampling_ratio/min": 1.209868287332938e-06, + "sampling/sampling_logp_difference/max": 13.624999046325684, + "sampling/sampling_logp_difference/mean": 0.01959329843521118, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.8946868863167765e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8946868863167765e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15694.0, + "completions/mean_length": 7298.78125, + "completions/mean_terminated_length": 7154.57177734375, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.9978953301906586, + "epoch": 0.48942042318307266, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002104024635627866, + "learning_rate": 1e-05, + "loss": 0.0104, + "num_tokens": 468445132.0, + "reward": 0.2890625, + "reward_std": 0.2301519513130188, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999783039093018, + "sampling/importance_sampling_ratio/min": 5.157754640094936e-05, + "sampling/sampling_logp_difference/max": 9.872424125671387, + "sampling/sampling_logp_difference/mean": 0.021517785266041756, + "step": 532 + }, + { + "clip_ratio/high_max": 2.0034196040796814e-05, + "clip_ratio/high_mean": 6.441706659643387e-06, + "clip_ratio/low_mean": 3.0451521752183908e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.689322829814046e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16003.0, + "completions/mean_length": 7021.53125, + "completions/mean_terminated_length": 6561.08154296875, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.9539581760764122, + "epoch": 0.49034038638454464, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0009346248698420823, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 469360760.0, + "reward": 0.375, + "reward_std": 0.20069600641727448, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999547004699707, + "sampling/importance_sampling_ratio/min": 0.0029978419188410044, + "sampling/sampling_logp_difference/max": 5.8098626136779785, + "sampling/sampling_logp_difference/mean": 0.020538944751024246, + "step": 533 + }, + { + "clip_ratio/high_max": 7.874939228713629e-06, + "clip_ratio/high_mean": 1.968734807178407e-06, + "clip_ratio/low_mean": 3.2224923302237585e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.419365827994625e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15370.0, + "completions/max_terminated_length": 15370.0, + "completions/mean_length": 6988.2109375, + "completions/mean_terminated_length": 6988.2109375, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "entropy": 0.9471191540360451, + "epoch": 0.49126034958601655, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002331435214728117, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 470274859.0, + "reward": 0.3203125, + "reward_std": 0.23751860857009888, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000002145767212, + "sampling/importance_sampling_ratio/min": 0.0015642779180780053, + "sampling/sampling_logp_difference/max": 6.460330963134766, + "sampling/sampling_logp_difference/mean": 0.02088295854628086, + "step": 534 + }, + { + "clip_ratio/high_max": 1.2364610256554442e-05, + "clip_ratio/high_mean": 3.0911525641386106e-06, + "clip_ratio/low_mean": 3.8229277151913266e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.132042954552162e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16212.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 7557.453125, + "completions/mean_terminated_length": 7557.453125, 
+ "completions/min_length": 1064.0, + "completions/min_terminated_length": 1064.0, + "entropy": 0.9897207245230675, + "epoch": 0.4921803127874885, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004562230780720711, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 471263997.0, + "reward": 0.4765625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 0.0001586318830959499, + "sampling/sampling_logp_difference/max": 8.748924255371094, + "sampling/sampling_logp_difference/mean": 0.02160259149968624, + "step": 535 + }, + { + "clip_ratio/high_max": 2.6050724500237266e-05, + "clip_ratio/high_mean": 7.420082738462952e-06, + "clip_ratio/low_mean": 5.8747830053107464e-05, + "clip_ratio/low_min": 1.3906133062846493e-05, + "clip_ratio/region_mean": 6.616791324631777e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15603.0, + "completions/mean_length": 6532.1953125, + "completions/mean_terminated_length": 6295.75244140625, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.9109068289399147, + "epoch": 0.49310027598896045, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004525062162429094, + "learning_rate": 1e-05, + "loss": 0.0219, + "num_tokens": 472120622.0, + "reward": 0.4296875, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999650120735168, + "sampling/importance_sampling_ratio/min": 1.474883083574241e-05, + "sampling/sampling_logp_difference/max": 11.124346733093262, + "sampling/sampling_logp_difference/mean": 
0.019527796655893326, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.90738064766083e-05, + "clip_ratio/low_min": 1.0626089533616323e-05, + "clip_ratio/region_mean": 3.90738064766083e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15011.0, + "completions/mean_length": 5994.40625, + "completions/mean_terminated_length": 5912.5986328125, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "entropy": 0.9276224821805954, + "epoch": 0.49402023919043236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005058468785136938, + "learning_rate": 1e-05, + "loss": 0.0165, + "num_tokens": 472906346.0, + "reward": 0.421875, + "reward_std": 0.19044627249240875, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999456405639648, + "sampling/importance_sampling_ratio/min": 0.0005196271813474596, + "sampling/sampling_logp_difference/max": 7.562398910522461, + "sampling/sampling_logp_difference/mean": 0.020568232983350754, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.992188062009518e-05, + "clip_ratio/low_min": 1.2131874427723233e-05, + "clip_ratio/region_mean": 5.992188062009518e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15992.0, + "completions/mean_length": 6469.046875, + "completions/mean_terminated_length": 6311.6669921875, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.9536962807178497, + "epoch": 0.49494020239190434, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.007286665495485067, + "learning_rate": 1e-05, + "loss": 0.1282, + "num_tokens": 473756256.0, + "reward": 0.3515625, + 
"reward_std": 0.35772189497947693, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000038146972656, + "sampling/importance_sampling_ratio/min": 6.244324322324246e-05, + "sampling/sampling_logp_difference/max": 9.681252479553223, + "sampling/sampling_logp_difference/mean": 0.019624462351202965, + "step": 538 + }, + { + "clip_ratio/high_max": 1.0018506145570427e-05, + "clip_ratio/high_mean": 2.504626536392607e-06, + "clip_ratio/low_mean": 3.329443018174061e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.57990563770727e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15383.0, + "completions/max_terminated_length": 15383.0, + "completions/mean_length": 5778.703125, + "completions/mean_terminated_length": 5778.703125, + "completions/min_length": 903.0, + "completions/min_terminated_length": 903.0, + "entropy": 0.9274095296859741, + "epoch": 0.49586016559337626, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0031439310405403376, + "learning_rate": 1e-05, + "loss": -0.0091, + "num_tokens": 474515194.0, + "reward": 0.3828125, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000576972961426, + "sampling/importance_sampling_ratio/min": 0.0006267410353757441, + "sampling/sampling_logp_difference/max": 7.374977111816406, + "sampling/sampling_logp_difference/mean": 0.019796252250671387, + "step": 539 + }, + { + "clip_ratio/high_max": 3.1761268928676145e-05, + "clip_ratio/high_mean": 9.23904565297562e-06, + "clip_ratio/low_mean": 4.140612338687788e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.064516949460085e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16146.0, + 
"completions/max_terminated_length": 16146.0, + "completions/mean_length": 6400.75, + "completions/mean_terminated_length": 6400.75, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 0.8927748426795006, + "epoch": 0.49678012879484823, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0039032045751810074, + "learning_rate": 1e-05, + "loss": 0.0938, + "num_tokens": 475355186.0, + "reward": 0.5546875, + "reward_std": 0.3135277032852173, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880194664001, + "sampling/importance_sampling_ratio/min": 4.19893694925122e-06, + "sampling/sampling_logp_difference/max": 12.3806791305542, + "sampling/sampling_logp_difference/mean": 0.019878748804330826, + "step": 540 + }, + { + "clip_ratio/high_max": 2.524126966818585e-05, + "clip_ratio/high_mean": 7.227385253827379e-06, + "clip_ratio/low_mean": 5.609390495919797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.332129100883321e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14347.0, + "completions/mean_length": 7150.234375, + "completions/mean_terminated_length": 6928.62451171875, + "completions/min_length": 1548.0, + "completions/min_terminated_length": 1548.0, + "entropy": 0.8632503524422646, + "epoch": 0.49770009199632015, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004979084711521864, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 476289752.0, + "reward": 0.4765625, + "reward_std": 0.3369181156158447, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999991059303284, + "sampling/importance_sampling_ratio/min": 0.0004304716712795198, + 
"sampling/sampling_logp_difference/max": 7.75062894821167, + "sampling/sampling_logp_difference/mean": 0.019658904522657394, + "step": 541 + }, + { + "clip_ratio/high_max": 2.5298505988757825e-05, + "clip_ratio/high_mean": 6.324626497189456e-06, + "clip_ratio/low_mean": 3.922748987861269e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.555211648948898e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16353.0, + "completions/mean_length": 6855.6640625, + "completions/mean_terminated_length": 6704.4208984375, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.8328540697693825, + "epoch": 0.49862005519779207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003560611279681325, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 477186885.0, + "reward": 0.515625, + "reward_std": 0.2743411958217621, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998643398284912, + "sampling/importance_sampling_ratio/min": 0.00021035241661593318, + "sampling/sampling_logp_difference/max": 8.466726303100586, + "sampling/sampling_logp_difference/mean": 0.01880962960422039, + "step": 542 + }, + { + "clip_ratio/high_max": 8.90761498339998e-06, + "clip_ratio/high_mean": 2.226903745849995e-06, + "clip_ratio/low_mean": 5.487640487444878e-05, + "clip_ratio/low_min": 6.345177553157555e-06, + "clip_ratio/region_mean": 5.7103308108708006e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15880.0, + "completions/mean_length": 7117.1015625, + "completions/mean_terminated_length": 6818.1689453125, + "completions/min_length": 1067.0, + "completions/min_terminated_length": 1067.0, + "entropy": 0.9280833601951599, + "epoch": 0.49954001839926404, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037869063671678305, + "learning_rate": 1e-05, + "loss": 0.0773, + "num_tokens": 478121506.0, + "reward": 0.484375, + "reward_std": 0.2919674217700958, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999738931655884, + "sampling/importance_sampling_ratio/min": 3.256524507833092e-07, + "sampling/sampling_logp_difference/max": 14.937435150146484, + "sampling/sampling_logp_difference/mean": 0.0203043594956398, + "step": 543 + }, + { + "clip_ratio/high_max": 1.3482746680892888e-05, + "clip_ratio/high_mean": 3.370686670223222e-06, + "clip_ratio/low_mean": 3.976425330165512e-05, + "clip_ratio/low_min": 4.979286131856497e-06, + "clip_ratio/region_mean": 4.313493991503492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16046.0, + "completions/mean_length": 6885.7109375, + "completions/mean_terminated_length": 6734.94482421875, + "completions/min_length": 1184.0, + "completions/min_terminated_length": 1184.0, + "entropy": 0.9137701392173767, + "epoch": 0.500459981600736, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002787451259791851, + "learning_rate": 1e-05, + "loss": 0.0847, + "num_tokens": 479021365.0, + "reward": 0.5, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000042915344238, + "sampling/importance_sampling_ratio/min": 0.0013747947523370385, + "sampling/sampling_logp_difference/max": 6.589450836181641, + "sampling/sampling_logp_difference/mean": 0.02060278132557869, + "step": 544 + }, + { + "clip_ratio/high_max": 2.918380459959735e-05, + "clip_ratio/high_mean": 8.077826691987866e-06, + "clip_ratio/low_mean": 
4.93504342102824e-05, + "clip_ratio/low_min": 5.1258921303087845e-06, + "clip_ratio/region_mean": 5.742826124333078e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15047.0, + "completions/mean_length": 7055.7265625, + "completions/mean_terminated_length": 6982.275390625, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "entropy": 1.1009352952241898, + "epoch": 0.5013799448022079, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005555091425776482, + "learning_rate": 1e-05, + "loss": 0.0225, + "num_tokens": 479951778.0, + "reward": 0.28125, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 2.7657671353154e-07, + "sampling/sampling_logp_difference/max": 15.100777626037598, + "sampling/sampling_logp_difference/mean": 0.02176634594798088, + "step": 545 + }, + { + "clip_ratio/high_max": 9.75229158939328e-06, + "clip_ratio/high_mean": 2.43807289734832e-06, + "clip_ratio/low_mean": 3.58120408918694e-05, + "clip_ratio/low_min": 5.571651399804978e-06, + "clip_ratio/region_mean": 3.825011424396507e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16100.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 6088.2109375, + "completions/mean_terminated_length": 6088.2109375, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.7534168809652328, + "epoch": 0.5022999080036799, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.00568060576915741, + "learning_rate": 1e-05, + "loss": 0.1423, + "num_tokens": 480749677.0, + "reward": 0.6484375, + "reward_std": 0.3729842007160187, + "rewards/accuracy_reward/mean": 0.6484375, + 
"rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999527931213379, + "sampling/importance_sampling_ratio/min": 0.0002166072663385421, + "sampling/sampling_logp_difference/max": 8.437424659729004, + "sampling/sampling_logp_difference/mean": 0.017093103379011154, + "step": 546 + }, + { + "clip_ratio/high_max": 1.821310434024781e-05, + "clip_ratio/high_mean": 4.5532760850619525e-06, + "clip_ratio/low_mean": 2.870424191314669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.325751754346129e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16029.0, + "completions/mean_length": 5638.8515625, + "completions/mean_terminated_length": 5380.96826171875, + "completions/min_length": 1352.0, + "completions/min_terminated_length": 1352.0, + "entropy": 0.8868100792169571, + "epoch": 0.5032198712051518, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019015485886484385, + "learning_rate": 1e-05, + "loss": 0.1025, + "num_tokens": 481489954.0, + "reward": 0.59375, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999911904335022, + "sampling/importance_sampling_ratio/min": 0.0001796126161934808, + "sampling/sampling_logp_difference/max": 8.62470817565918, + "sampling/sampling_logp_difference/mean": 0.019102448597550392, + "step": 547 + }, + { + "clip_ratio/high_max": 2.3414544557454064e-05, + "clip_ratio/high_mean": 7.0229532411758555e-06, + "clip_ratio/low_mean": 3.169551814607985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8718471842003055e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15258.0, + "completions/mean_length": 
6776.59375, + "completions/mean_terminated_length": 6624.095703125, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.9075161814689636, + "epoch": 0.5041398344066237, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004203350283205509, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 482375358.0, + "reward": 0.453125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999104738235474, + "sampling/importance_sampling_ratio/min": 0.0036098493728786707, + "sampling/sampling_logp_difference/max": 5.6320695877075195, + "sampling/sampling_logp_difference/mean": 0.019327163696289062, + "step": 548 + }, + { + "clip_ratio/high_max": 1.8746226487564854e-05, + "clip_ratio/high_mean": 5.84939061809564e-06, + "clip_ratio/low_mean": 3.6077018648938974e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.192640903966094e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15684.0, + "completions/mean_length": 7507.59375, + "completions/mean_terminated_length": 7071.048828125, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "entropy": 0.8015655726194382, + "epoch": 0.5050597976080957, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004891456104815006, + "learning_rate": 1e-05, + "loss": 0.0499, + "num_tokens": 483357450.0, + "reward": 0.3359375, + "reward_std": 0.2772369980812073, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999200701713562, + "sampling/importance_sampling_ratio/min": 0.0032753932755440474, + "sampling/sampling_logp_difference/max": 5.721317291259766, + 
"sampling/sampling_logp_difference/mean": 0.019086822867393494, + "step": 549 + }, + { + "clip_ratio/high_max": 2.4045971031227964e-05, + "clip_ratio/high_mean": 6.011492757806991e-06, + "clip_ratio/low_mean": 3.096040018135682e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.697189299600723e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 6061.3125, + "completions/mean_terminated_length": 5813.568359375, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.8335569724440575, + "epoch": 0.5059797608095676, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003564947983250022, + "learning_rate": 1e-05, + "loss": 0.028, + "num_tokens": 484153554.0, + "reward": 0.3984375, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999876022338867, + "sampling/importance_sampling_ratio/min": 0.02006213553249836, + "sampling/sampling_logp_difference/max": 3.908921003341675, + "sampling/sampling_logp_difference/mean": 0.018360145390033722, + "step": 550 + }, + { + "clip_ratio/high_max": 9.095339009945747e-06, + "clip_ratio/high_mean": 2.2738347524864366e-06, + "clip_ratio/low_mean": 4.612986276697484e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.840369865632965e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 7312.4921875, + "completions/mean_terminated_length": 7241.06298828125, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.9900097697973251, + "epoch": 0.5068997240110396, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0032013265881687403, + 
"learning_rate": 1e-05, + "loss": 0.0976, + "num_tokens": 485111601.0, + "reward": 0.3125, + "reward_std": 0.21040895581245422, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999306201934814, + "sampling/importance_sampling_ratio/min": 0.006552733480930328, + "sampling/sampling_logp_difference/max": 5.0278730392456055, + "sampling/sampling_logp_difference/mean": 0.020712960511446, + "step": 551 + }, + { + "clip_ratio/high_max": 1.360053283860907e-05, + "clip_ratio/high_mean": 4.2937051603075815e-06, + "clip_ratio/low_mean": 4.3424448904261226e-05, + "clip_ratio/low_min": 4.718405762105249e-06, + "clip_ratio/region_mean": 4.771815429194248e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14797.0, + "completions/max_terminated_length": 14797.0, + "completions/mean_length": 6571.4453125, + "completions/mean_terminated_length": 6571.4453125, + "completions/min_length": 951.0, + "completions/min_terminated_length": 951.0, + "entropy": 0.8801060244441032, + "epoch": 0.5078196872125115, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002972986316308379, + "learning_rate": 1e-05, + "loss": 0.0888, + "num_tokens": 485971554.0, + "reward": 0.5234375, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998995065689087, + "sampling/importance_sampling_ratio/min": 2.4590379325672984e-05, + "sampling/sampling_logp_difference/max": 10.613155364990234, + "sampling/sampling_logp_difference/mean": 0.020055105909705162, + "step": 552 + }, + { + "clip_ratio/high_max": 8.231255606006016e-06, + "clip_ratio/high_mean": 2.057813901501504e-06, + "clip_ratio/low_mean": 3.511405452627514e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.71718685983069e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 6879.2890625, + "completions/mean_terminated_length": 6728.4208984375, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.8452998399734497, + "epoch": 0.5087396504139834, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00798189826309681, + "learning_rate": 1e-05, + "loss": 0.0278, + "num_tokens": 486873791.0, + "reward": 0.4609375, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493956565857, + "sampling/importance_sampling_ratio/min": 0.005210345610976219, + "sampling/sampling_logp_difference/max": 5.25710916519165, + "sampling/sampling_logp_difference/mean": 0.02010834403336048, + "step": 553 + }, + { + "clip_ratio/high_max": 1.757707786964602e-05, + "clip_ratio/high_mean": 4.394269467411505e-06, + "clip_ratio/low_mean": 6.0756912262149854e-05, + "clip_ratio/low_min": 1.0878021839744179e-05, + "clip_ratio/region_mean": 6.51511809337535e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16237.0, + "completions/max_terminated_length": 16237.0, + "completions/mean_length": 7169.8828125, + "completions/mean_terminated_length": 7169.8828125, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.9671438857913017, + "epoch": 0.5096596136154554, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0038661460857838392, + "learning_rate": 1e-05, + "loss": 0.0389, + "num_tokens": 487814936.0, + "reward": 0.3359375, + "reward_std": 0.23751862347126007, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 4.6830271458020434e-05, + "sampling/sampling_logp_difference/max": 9.96898078918457, + "sampling/sampling_logp_difference/mean": 0.02097059041261673, + "step": 554 + }, + { + "clip_ratio/high_max": 4.649260063160909e-06, + "clip_ratio/high_mean": 1.1623150157902273e-06, + "clip_ratio/low_mean": 3.180719090778439e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2969506037261453e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 6945.0390625, + "completions/mean_terminated_length": 6870.71630859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9309702143073082, + "epoch": 0.5105795768169273, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002214127918705344, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 488720293.0, + "reward": 0.375, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914169311523, + "sampling/importance_sampling_ratio/min": 0.00032080389792099595, + "sampling/sampling_logp_difference/max": 8.04468059539795, + "sampling/sampling_logp_difference/mean": 0.01968962326645851, + "step": 555 + }, + { + "clip_ratio/high_max": 1.5428002825501608e-05, + "clip_ratio/high_mean": 3.857000706375402e-06, + "clip_ratio/low_mean": 5.9988536690980254e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.384553716998198e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5970.1015625, + "completions/mean_terminated_length": 5804.8017578125, + "completions/min_length": 610.0, + 
"completions/min_terminated_length": 610.0, + "entropy": 0.8274230882525444, + "epoch": 0.5114995400183993, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026088031008839607, + "learning_rate": 1e-05, + "loss": 0.0919, + "num_tokens": 489504626.0, + "reward": 0.484375, + "reward_std": 0.3237725496292114, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999892711639404, + "sampling/importance_sampling_ratio/min": 0.00033548183273524046, + "sampling/sampling_logp_difference/max": 7.999942779541016, + "sampling/sampling_logp_difference/mean": 0.018132124096155167, + "step": 556 + }, + { + "clip_ratio/high_max": 1.628765676287003e-05, + "clip_ratio/high_mean": 5.032566036788921e-06, + "clip_ratio/low_mean": 3.257978141846252e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.761234722787776e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15636.0, + "completions/mean_length": 7099.578125, + "completions/mean_terminated_length": 6952.20654296875, + "completions/min_length": 567.0, + "completions/min_terminated_length": 567.0, + "entropy": 0.8690815567970276, + "epoch": 0.5124195032198712, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0040014018304646015, + "learning_rate": 1e-05, + "loss": 0.0021, + "num_tokens": 490431156.0, + "reward": 0.4609375, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999368786811829, + "sampling/importance_sampling_ratio/min": 0.0007102031959220767, + "sampling/sampling_logp_difference/max": 7.249959468841553, + "sampling/sampling_logp_difference/mean": 0.02036934345960617, + "step": 557 + }, + { + "clip_ratio/high_max": 
1.3314914440343273e-05, + "clip_ratio/high_mean": 3.3287286100858182e-06, + "clip_ratio/low_mean": 3.747020150512981e-05, + "clip_ratio/low_min": 3.852436293527717e-06, + "clip_ratio/region_mean": 4.079892983099853e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16301.0, + "completions/mean_length": 7253.296875, + "completions/mean_terminated_length": 6725.07421875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8692722395062447, + "epoch": 0.5133394664213431, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002252641599625349, + "learning_rate": 1e-05, + "loss": 0.0711, + "num_tokens": 491378450.0, + "reward": 0.328125, + "reward_std": 0.2488291710615158, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999855756759644, + "sampling/importance_sampling_ratio/min": 1.893525586638134e-05, + "sampling/sampling_logp_difference/max": 10.87448501586914, + "sampling/sampling_logp_difference/mean": 0.01926814392209053, + "step": 558 + }, + { + "clip_ratio/high_max": 3.51339258486405e-05, + "clip_ratio/high_mean": 1.0567253070803417e-05, + "clip_ratio/low_mean": 3.905345306520758e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962070602232416e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 7827.0234375, + "completions/mean_terminated_length": 7406.18798828125, + "completions/min_length": 808.0, + "completions/min_terminated_length": 808.0, + "entropy": 0.9718392416834831, + "epoch": 0.5142594296228151, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023995323572307825, + "learning_rate": 1e-05, + "loss": 0.0684, + "num_tokens": 492398757.0, + "reward": 0.3359375, + "reward_std": 
0.26826781034469604, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999961256980896, + "sampling/importance_sampling_ratio/min": 0.0003522284678183496, + "sampling/sampling_logp_difference/max": 7.951230525970459, + "sampling/sampling_logp_difference/mean": 0.020725054666399956, + "step": 559 + }, + { + "clip_ratio/high_max": 9.237001677320222e-06, + "clip_ratio/high_mean": 2.3092504193300556e-06, + "clip_ratio/low_mean": 4.477454979223694e-05, + "clip_ratio/low_min": 3.5987793580716243e-06, + "clip_ratio/region_mean": 4.708380049578409e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14833.0, + "completions/max_terminated_length": 14833.0, + "completions/mean_length": 6578.53125, + "completions/mean_terminated_length": 6578.53125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.9265799149870872, + "epoch": 0.515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0053934333845973015, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 493259049.0, + "reward": 0.4140625, + "reward_std": 0.29196250438690186, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999976396560669, + "sampling/importance_sampling_ratio/min": 1.5993017541404697e-06, + "sampling/sampling_logp_difference/max": 13.345943450927734, + "sampling/sampling_logp_difference/mean": 0.019497254863381386, + "step": 560 + }, + { + "clip_ratio/high_max": 6.991247119003674e-06, + "clip_ratio/high_mean": 2.580789669082151e-06, + "clip_ratio/low_mean": 4.2538599473118666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.511938891482714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15783.0, + "completions/mean_length": 7893.7734375, + "completions/mean_terminated_length": 7826.92138671875, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "entropy": 0.9697273746132851, + "epoch": 0.516099356025759, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003773769596591592, + "learning_rate": 1e-05, + "loss": 0.061, + "num_tokens": 494288028.0, + "reward": 0.296875, + "reward_std": 0.29272884130477905, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000444650650024, + "sampling/importance_sampling_ratio/min": 4.6216489863581955e-05, + "sampling/sampling_logp_difference/max": 9.982173919677734, + "sampling/sampling_logp_difference/mean": 0.020743828266859055, + "step": 561 + }, + { + "clip_ratio/high_max": 1.060595786839258e-05, + "clip_ratio/high_mean": 4.29665919909894e-06, + "clip_ratio/low_mean": 3.2997783137034276e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729444244982005e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15024.0, + "completions/mean_length": 6483.7734375, + "completions/mean_terminated_length": 6405.81884765625, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.8293593674898148, + "epoch": 0.5170193192272309, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.006334445904940367, + "learning_rate": 1e-05, + "loss": 0.0217, + "num_tokens": 495135903.0, + "reward": 0.5, + "reward_std": 0.20251333713531494, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999064207077026, + "sampling/importance_sampling_ratio/min": 0.0001236602693097666, + 
"sampling/sampling_logp_difference/max": 8.99797248840332, + "sampling/sampling_logp_difference/mean": 0.018669776618480682, + "step": 562 + }, + { + "clip_ratio/high_max": 9.357276894661481e-06, + "clip_ratio/high_mean": 2.3393192236653704e-06, + "clip_ratio/low_mean": 4.667806888392079e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.901738748230855e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16230.0, + "completions/mean_length": 6484.546875, + "completions/mean_terminated_length": 6246.96044921875, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.7686850279569626, + "epoch": 0.5179392824287029, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003286323742941022, + "learning_rate": 1e-05, + "loss": 0.0865, + "num_tokens": 495986277.0, + "reward": 0.59375, + "reward_std": 0.3763991594314575, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999945342540741, + "sampling/importance_sampling_ratio/min": 2.0216441043885425e-05, + "sampling/sampling_logp_difference/max": 10.809014320373535, + "sampling/sampling_logp_difference/mean": 0.018656805157661438, + "step": 563 + }, + { + "clip_ratio/high_max": 3.368905208844808e-05, + "clip_ratio/high_mean": 9.76577109668142e-06, + "clip_ratio/low_mean": 8.26880966542376e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8034580989478854e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 6411.3203125, + "completions/mean_terminated_length": 5746.47509765625, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 0.899998240172863, + "epoch": 0.5188592456301748, + "frac_reward_zero_std": 0.5, 
+ "grad_norm": 0.005072349216789007, + "learning_rate": 1e-05, + "loss": -0.0049, + "num_tokens": 496826094.0, + "reward": 0.515625, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999135732650757, + "sampling/importance_sampling_ratio/min": 0.0038024066016077995, + "sampling/sampling_logp_difference/max": 5.5721211433410645, + "sampling/sampling_logp_difference/mean": 0.019648944959044456, + "step": 564 + }, + { + "clip_ratio/high_max": 1.726673963275971e-05, + "clip_ratio/high_mean": 6.2551004020861e-06, + "clip_ratio/low_mean": 4.834715275592316e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4602252930635586e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 7110.0546875, + "completions/mean_terminated_length": 6810.89501953125, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 1.0061073675751686, + "epoch": 0.5197792088316467, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005030680447816849, + "learning_rate": 1e-05, + "loss": 0.0871, + "num_tokens": 497756469.0, + "reward": 0.375, + "reward_std": 0.3253750801086426, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999985933303833, + "sampling/importance_sampling_ratio/min": 0.0004307488852646202, + "sampling/sampling_logp_difference/max": 7.749985218048096, + "sampling/sampling_logp_difference/mean": 0.02187274768948555, + "step": 565 + }, + { + "clip_ratio/high_max": 3.3920382520591374e-06, + "clip_ratio/high_mean": 8.480095630147844e-07, + "clip_ratio/low_mean": 2.627351494766117e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.712152416961544e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7546.484375, + "completions/mean_terminated_length": 7261.40283203125, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.898541085422039, + "epoch": 0.5206991720331187, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002894402015954256, + "learning_rate": 1e-05, + "loss": -0.0016, + "num_tokens": 498743411.0, + "reward": 0.25, + "reward_std": 0.2380426526069641, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998988509178162, + "sampling/importance_sampling_ratio/min": 3.340166585985571e-05, + "sampling/sampling_logp_difference/max": 10.306904792785645, + "sampling/sampling_logp_difference/mean": 0.019597206264734268, + "step": 566 + }, + { + "clip_ratio/high_max": 3.407480107853189e-06, + "clip_ratio/high_mean": 8.518700269632973e-07, + "clip_ratio/low_mean": 1.9815101950371172e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.066697197733447e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15426.0, + "completions/mean_length": 6637.9296875, + "completions/mean_terminated_length": 6241.74755859375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "entropy": 0.9469815120100975, + "epoch": 0.5216191352345906, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033100086729973555, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 499612490.0, + "reward": 0.4375, + "reward_std": 0.2782978415489197, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999792575836182, + "sampling/importance_sampling_ratio/min": 0.000214192972634919, + "sampling/sampling_logp_difference/max": 8.448633193969727, + "sampling/sampling_logp_difference/mean": 0.019627269357442856, + "step": 567 + }, + { + "clip_ratio/high_max": 2.8962323767700582e-05, + "clip_ratio/high_mean": 7.2405809419251455e-06, + "clip_ratio/low_mean": 6.551078422489809e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.275136522366665e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15136.0, + "completions/mean_length": 6903.0859375, + "completions/mean_terminated_length": 6752.595703125, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.976447619497776, + "epoch": 0.5225390984360626, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006571728736162186, + "learning_rate": 1e-05, + "loss": 0.0543, + "num_tokens": 500515117.0, + "reward": 0.40625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.016446342691779137, + "sampling/sampling_logp_difference/max": 4.107652187347412, + "sampling/sampling_logp_difference/mean": 0.020653847604990005, + "step": 568 + }, + { + "clip_ratio/high_max": 1.4576415196643211e-05, + "clip_ratio/high_mean": 3.6441037991608027e-06, + "clip_ratio/low_mean": 7.513643731726916e-05, + "clip_ratio/low_min": 2.2551557776750997e-05, + "clip_ratio/region_mean": 7.878054020693526e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15556.0, + "completions/mean_length": 6953.8359375, + "completions/mean_terminated_length": 6570.49560546875, + 
"completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.8397975340485573, + "epoch": 0.5234590616375345, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.007468517404049635, + "learning_rate": 1e-05, + "loss": 0.0618, + "num_tokens": 501427056.0, + "reward": 0.421875, + "reward_std": 0.3571978807449341, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.0001911464933073148, + "sampling/sampling_logp_difference/max": 8.562470436096191, + "sampling/sampling_logp_difference/mean": 0.01937997341156006, + "step": 569 + }, + { + "clip_ratio/high_max": 3.168922489749093e-05, + "clip_ratio/high_mean": 7.922306224372733e-06, + "clip_ratio/low_mean": 3.7468206755875144e-05, + "clip_ratio/low_min": 5.264044375508092e-06, + "clip_ratio/region_mean": 4.5390514060272835e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15961.0, + "completions/mean_length": 7807.09375, + "completions/mean_terminated_length": 7458.43896484375, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "entropy": 0.7974586114287376, + "epoch": 0.5243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004324767272919416, + "learning_rate": 1e-05, + "loss": 0.0431, + "num_tokens": 502445156.0, + "reward": 0.265625, + "reward_std": 0.3329663574695587, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999243021011353, + "sampling/importance_sampling_ratio/min": 2.9874459869461134e-05, + "sampling/sampling_logp_difference/max": 10.418506622314453, + "sampling/sampling_logp_difference/mean": 0.018592730164527893, + 
"step": 570 + }, + { + "clip_ratio/high_max": 1.8414293663227e-05, + "clip_ratio/high_mean": 5.567038670051261e-06, + "clip_ratio/low_mean": 3.436269958001503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9929738250066293e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 6467.890625, + "completions/mean_terminated_length": 6310.4921875, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "entropy": 0.8665193468332291, + "epoch": 0.5252989880404784, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0044867550022900105, + "learning_rate": 1e-05, + "loss": 0.0434, + "num_tokens": 503293398.0, + "reward": 0.4609375, + "reward_std": 0.2998581528663635, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.024881144985556602, + "sampling/sampling_logp_difference/max": 3.6936450004577637, + "sampling/sampling_logp_difference/mean": 0.019022464752197266, + "step": 571 + }, + { + "clip_ratio/high_max": 1.4845849818811985e-05, + "clip_ratio/high_mean": 3.711462454702996e-06, + "clip_ratio/low_mean": 3.597185968828853e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.968332202930469e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16309.0, + "completions/mean_length": 6275.796875, + "completions/mean_terminated_length": 6115.349609375, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 0.8425783589482307, + "epoch": 0.5262189512419503, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0033805551938712597, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 504115692.0, + "reward": 
0.3984375, + "reward_std": 0.2569621503353119, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000152587890625, + "sampling/importance_sampling_ratio/min": 0.018389537930488586, + "sampling/sampling_logp_difference/max": 3.9959733486175537, + "sampling/sampling_logp_difference/mean": 0.018935590982437134, + "step": 572 + }, + { + "clip_ratio/high_max": 4.3129479763592826e-05, + "clip_ratio/high_mean": 1.3471904480866215e-05, + "clip_ratio/low_mean": 1.670091853611666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0172822903296037e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16116.0, + "completions/mean_length": 5396.7890625, + "completions/mean_terminated_length": 5222.38916015625, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.8558806329965591, + "epoch": 0.5271389144434223, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00652205478399992, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 504826577.0, + "reward": 0.546875, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.0017056812066584826, + "sampling/sampling_logp_difference/max": 6.373790740966797, + "sampling/sampling_logp_difference/mean": 0.018737314268946648, + "step": 573 + }, + { + "clip_ratio/high_max": 6.914692676218692e-06, + "clip_ratio/high_mean": 1.728673169054673e-06, + "clip_ratio/low_mean": 2.3435458388121333e-05, + "clip_ratio/low_min": 3.954319709009724e-06, + "clip_ratio/region_mean": 2.5164132239297032e-05, + "completions/clipped_ratio": 0.0859375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16298.0, + "completions/mean_length": 7798.9765625, + "completions/mean_terminated_length": 6991.837890625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.8846152648329735, + "epoch": 0.5280588776448942, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018958896398544312, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 505846438.0, + "reward": 0.328125, + "reward_std": 0.21253062784671783, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999515414237976, + "sampling/importance_sampling_ratio/min": 2.434831731079612e-05, + "sampling/sampling_logp_difference/max": 10.623047828674316, + "sampling/sampling_logp_difference/mean": 0.019361287355422974, + "step": 574 + }, + { + "clip_ratio/high_max": 1.085428675651201e-05, + "clip_ratio/high_mean": 5.064732249593362e-06, + "clip_ratio/low_mean": 5.590463968019321e-05, + "clip_ratio/low_min": 4.822531082027126e-06, + "clip_ratio/region_mean": 6.096937283928128e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16280.0, + "completions/mean_length": 6272.5546875, + "completions/mean_terminated_length": 6029.88037109375, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "entropy": 0.9714803844690323, + "epoch": 0.5289788408463661, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003035407979041338, + "learning_rate": 1e-05, + "loss": 0.1295, + "num_tokens": 506670477.0, + "reward": 0.3984375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999212026596069, + 
"sampling/importance_sampling_ratio/min": 0.0012103202752768993, + "sampling/sampling_logp_difference/max": 6.716870307922363, + "sampling/sampling_logp_difference/mean": 0.019988738000392914, + "step": 575 + }, + { + "clip_ratio/high_max": 2.1176599602767965e-05, + "clip_ratio/high_mean": 5.294149900691991e-06, + "clip_ratio/low_mean": 4.479086726405512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.008501784686814e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6060.75, + "completions/mean_terminated_length": 5896.88916015625, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 0.8791732639074326, + "epoch": 0.5298988040478381, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.005080445669591427, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 507471717.0, + "reward": 0.421875, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999859929084778, + "sampling/importance_sampling_ratio/min": 0.0025768836494535208, + "sampling/sampling_logp_difference/max": 5.961174488067627, + "sampling/sampling_logp_difference/mean": 0.019146449863910675, + "step": 576 + }, + { + "clip_ratio/high_max": 1.591328441463702e-05, + "clip_ratio/high_mean": 3.978321103659255e-06, + "clip_ratio/low_mean": 3.991827338722942e-05, + "clip_ratio/low_min": 4.394445568323135e-06, + "clip_ratio/region_mean": 4.389659511616628e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 7221.65625, + "completions/mean_terminated_length": 7149.51171875, + "completions/min_length": 1071.0, + "completions/min_terminated_length": 1071.0, + "entropy": 
0.9068904295563698, + "epoch": 0.53081876724931, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002491918858140707, + "learning_rate": 1e-05, + "loss": 0.0263, + "num_tokens": 508420417.0, + "reward": 0.3046875, + "reward_std": 0.22908622026443481, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999144077301025, + "sampling/importance_sampling_ratio/min": 0.0010015364969149232, + "sampling/sampling_logp_difference/max": 6.906219959259033, + "sampling/sampling_logp_difference/mean": 0.019857721403241158, + "step": 577 + }, + { + "clip_ratio/high_max": 2.723786337810452e-06, + "clip_ratio/high_mean": 6.80946584452613e-07, + "clip_ratio/low_mean": 4.729307283923845e-05, + "clip_ratio/low_min": 3.3817600524344016e-06, + "clip_ratio/region_mean": 4.7974018798413454e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16090.0, + "completions/mean_length": 7279.765625, + "completions/mean_terminated_length": 6909.67431640625, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "entropy": 0.7393763959407806, + "epoch": 0.531738730450782, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0038857783656567335, + "learning_rate": 1e-05, + "loss": 0.1167, + "num_tokens": 509367579.0, + "reward": 0.5703125, + "reward_std": 0.3782213628292084, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999372959136963, + "sampling/importance_sampling_ratio/min": 8.482332486892119e-05, + "sampling/sampling_logp_difference/max": 9.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01783195324242115, + "step": 578 + }, + { + "clip_ratio/high_max": 2.4269288587674964e-05, + 
"clip_ratio/high_mean": 6.067322146918741e-06, + "clip_ratio/low_mean": 5.770765028501046e-05, + "clip_ratio/low_min": 6.032236342434771e-06, + "clip_ratio/region_mean": 6.377497174980817e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15946.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 5381.4375, + "completions/mean_terminated_length": 5381.4375, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.8337196409702301, + "epoch": 0.5326586936522539, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.004505726508796215, + "learning_rate": 1e-05, + "loss": 0.1534, + "num_tokens": 510076403.0, + "reward": 0.484375, + "reward_std": 0.3861297369003296, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999825358390808, + "sampling/importance_sampling_ratio/min": 0.0021874941885471344, + "sampling/sampling_logp_difference/max": 6.124998569488525, + "sampling/sampling_logp_difference/mean": 0.019285976886749268, + "step": 579 + }, + { + "clip_ratio/high_max": 1.83111833393923e-05, + "clip_ratio/high_mean": 4.577795834848075e-06, + "clip_ratio/low_mean": 4.1738339632502175e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.631613546735025e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15789.0, + "completions/mean_length": 8440.7109375, + "completions/mean_terminated_length": 8250.072265625, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "entropy": 0.8920768201351166, + "epoch": 0.5335786568537259, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0039497604593634605, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 511177974.0, + "reward": 0.1875, + "reward_std": 0.18990950286388397, + 
"rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910831451416, + "sampling/importance_sampling_ratio/min": 0.00021938055579084903, + "sampling/sampling_logp_difference/max": 8.424702644348145, + "sampling/sampling_logp_difference/mean": 0.020451124757528305, + "step": 580 + }, + { + "clip_ratio/high_max": 1.371111534353986e-05, + "clip_ratio/high_mean": 3.427778835884965e-06, + "clip_ratio/low_mean": 4.171912905803765e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.514690772339236e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16077.0, + "completions/mean_length": 6702.3828125, + "completions/mean_terminated_length": 6470.0244140625, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.8600481152534485, + "epoch": 0.5344986200551978, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024386425502598286, + "learning_rate": 1e-05, + "loss": 0.0866, + "num_tokens": 512054655.0, + "reward": 0.5703125, + "reward_std": 0.26645052433013916, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000202655792236, + "sampling/importance_sampling_ratio/min": 0.0015237311599776149, + "sampling/sampling_logp_difference/max": 6.486593246459961, + "sampling/sampling_logp_difference/mean": 0.018986206501722336, + "step": 581 + }, + { + "clip_ratio/high_max": 9.279537152906414e-06, + "clip_ratio/high_mean": 4.2680171645770315e-06, + "clip_ratio/low_mean": 2.6773893978315755e-05, + "clip_ratio/low_min": 4.736104074254399e-06, + "clip_ratio/region_mean": 3.1041911142892786e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 13410.0, + "completions/mean_length": 4845.953125, + "completions/mean_terminated_length": 4755.1025390625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.9067303538322449, + "epoch": 0.5354185832566697, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0072782449424266815, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 512696537.0, + "reward": 0.4296875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999409317970276, + "sampling/importance_sampling_ratio/min": 0.017822081223130226, + "sampling/sampling_logp_difference/max": 4.027317047119141, + "sampling/sampling_logp_difference/mean": 0.01862735114991665, + "step": 582 + }, + { + "clip_ratio/high_max": 8.41807559481822e-06, + "clip_ratio/high_mean": 2.104518898704555e-06, + "clip_ratio/low_mean": 4.360654588708712e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5711064331044327e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16282.0, + "completions/mean_length": 6173.171875, + "completions/mean_terminated_length": 6011.095703125, + "completions/min_length": 756.0, + "completions/min_terminated_length": 756.0, + "entropy": 0.9604142308235168, + "epoch": 0.5363385464581417, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005057654343545437, + "learning_rate": 1e-05, + "loss": 0.0799, + "num_tokens": 513505135.0, + "reward": 0.4375, + "reward_std": 0.2767051160335541, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999635219573975, + "sampling/importance_sampling_ratio/min": 0.0002380619989708066, + 
"sampling/sampling_logp_difference/max": 8.342979431152344, + "sampling/sampling_logp_difference/mean": 0.020879898220300674, + "step": 583 + }, + { + "clip_ratio/high_max": 7.327939783863258e-06, + "clip_ratio/high_mean": 3.227510205761064e-06, + "clip_ratio/low_mean": 4.2579683963595016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.580719428304292e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15173.0, + "completions/mean_length": 5546.5234375, + "completions/mean_terminated_length": 5374.50048828125, + "completions/min_length": 1113.0, + "completions/min_terminated_length": 1113.0, + "entropy": 0.8015405982732773, + "epoch": 0.5372585096596136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0047672707587480545, + "learning_rate": 1e-05, + "loss": 0.0991, + "num_tokens": 514232058.0, + "reward": 0.4921875, + "reward_std": 0.27038949728012085, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 5.8323133998783305e-05, + "sampling/sampling_logp_difference/max": 9.74951171875, + "sampling/sampling_logp_difference/mean": 0.018185433000326157, + "step": 584 + }, + { + "clip_ratio/high_max": 1.3804907666781219e-05, + "clip_ratio/high_mean": 4.388961428958282e-06, + "clip_ratio/low_mean": 5.04182496570138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.480721097228525e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15778.0, + "completions/mean_length": 6637.359375, + "completions/mean_terminated_length": 6482.6513671875, + "completions/min_length": 1144.0, + "completions/min_terminated_length": 1144.0, + "entropy": 1.0173144191503525, + "epoch": 0.5381784728610856, + 
"frac_reward_zero_std": 0.4375, + "grad_norm": 0.005850035231560469, + "learning_rate": 1e-05, + "loss": 0.0453, + "num_tokens": 515103184.0, + "reward": 0.3046875, + "reward_std": 0.24988999962806702, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999963104724884, + "sampling/importance_sampling_ratio/min": 1.4479226706498594e-07, + "sampling/sampling_logp_difference/max": 15.747965812683105, + "sampling/sampling_logp_difference/mean": 0.020641878247261047, + "step": 585 + }, + { + "clip_ratio/high_max": 1.594428704265738e-05, + "clip_ratio/high_mean": 3.986071760664345e-06, + "clip_ratio/low_mean": 5.566071547491447e-05, + "clip_ratio/low_min": 8.978264304460026e-06, + "clip_ratio/region_mean": 5.964678746295249e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 6940.6171875, + "completions/mean_terminated_length": 6866.259765625, + "completions/min_length": 1273.0, + "completions/min_terminated_length": 1273.0, + "entropy": 0.8547529205679893, + "epoch": 0.5390984360625575, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0037875184789299965, + "learning_rate": 1e-05, + "loss": 0.0831, + "num_tokens": 516009791.0, + "reward": 0.4765625, + "reward_std": 0.27222442626953125, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999997615814209, + "sampling/importance_sampling_ratio/min": 5.772008080384694e-06, + "sampling/sampling_logp_difference/max": 12.062490463256836, + "sampling/sampling_logp_difference/mean": 0.018527517095208168, + "step": 586 + }, + { + "clip_ratio/high_max": 6.924382887518732e-06, + "clip_ratio/high_mean": 1.731095721879683e-06, + 
"clip_ratio/low_mean": 3.340147941344185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5132575476382044e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15387.0, + "completions/mean_length": 6837.125, + "completions/mean_terminated_length": 6761.95263671875, + "completions/min_length": 1319.0, + "completions/min_terminated_length": 1319.0, + "entropy": 0.9027494043111801, + "epoch": 0.5400183992640294, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015506440540775657, + "learning_rate": 1e-05, + "loss": 0.0502, + "num_tokens": 516903335.0, + "reward": 0.296875, + "reward_std": 0.20593318343162537, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 4.2636147554730996e-05, + "sampling/sampling_logp_difference/max": 10.0628080368042, + "sampling/sampling_logp_difference/mean": 0.020130250602960587, + "step": 587 + }, + { + "clip_ratio/high_max": 1.2774215747413109e-05, + "clip_ratio/high_mean": 3.1935539368532773e-06, + "clip_ratio/low_mean": 3.885528553837503e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.204883930469805e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7866.703125, + "completions/mean_terminated_length": 7222.5380859375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.8133657574653625, + "epoch": 0.5409383624655014, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003520917845889926, + "learning_rate": 1e-05, + "loss": 0.1165, + "num_tokens": 517929081.0, + "reward": 0.4453125, + "reward_std": 0.3316730856895447, + "rewards/accuracy_reward/mean": 0.4453125, + 
"rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999421834945679, + "sampling/importance_sampling_ratio/min": 6.223546370165423e-05, + "sampling/sampling_logp_difference/max": 9.684585571289062, + "sampling/sampling_logp_difference/mean": 0.01890747994184494, + "step": 588 + }, + { + "clip_ratio/high_max": 6.942207619431429e-06, + "clip_ratio/high_mean": 1.7355519048578572e-06, + "clip_ratio/low_mean": 3.457626269209868e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.631181459695654e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 6701.296875, + "completions/mean_terminated_length": 6547.603515625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9360691756010056, + "epoch": 0.5418583256669733, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029796145390719175, + "learning_rate": 1e-05, + "loss": 0.0447, + "num_tokens": 518810247.0, + "reward": 0.3359375, + "reward_std": 0.2869499921798706, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999505877494812, + "sampling/importance_sampling_ratio/min": 2.520391673144218e-10, + "sampling/sampling_logp_difference/max": 22.101436614990234, + "sampling/sampling_logp_difference/mean": 0.01977725327014923, + "step": 589 + }, + { + "clip_ratio/high_max": 3.7906356737948954e-06, + "clip_ratio/high_mean": 9.476589184487239e-07, + "clip_ratio/low_mean": 3.738725240509666e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8334911323545384e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15971.0, + "completions/mean_length": 
7029.453125, + "completions/mean_terminated_length": 6804.9443359375, + "completions/min_length": 1180.0, + "completions/min_terminated_length": 1180.0, + "entropy": 0.9168537557125092, + "epoch": 0.5427782888684453, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024249793495982885, + "learning_rate": 1e-05, + "loss": 0.0477, + "num_tokens": 519730577.0, + "reward": 0.390625, + "reward_std": 0.22803518176078796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999786615371704, + "sampling/importance_sampling_ratio/min": 1.6278204384434503e-07, + "sampling/sampling_logp_difference/max": 15.630853652954102, + "sampling/sampling_logp_difference/mean": 0.01923082396388054, + "step": 590 + }, + { + "clip_ratio/high_max": 2.4759768621152034e-05, + "clip_ratio/high_mean": 6.1899421552880085e-06, + "clip_ratio/low_mean": 3.2254738812298456e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8444680967586464e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15600.0, + "completions/mean_length": 7255.453125, + "completions/mean_terminated_length": 6646.8837890625, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "entropy": 0.8241118341684341, + "epoch": 0.5436982520699172, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003160425927489996, + "learning_rate": 1e-05, + "loss": 0.0821, + "num_tokens": 520680707.0, + "reward": 0.3359375, + "reward_std": 0.2461756467819214, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000334978103638, + "sampling/importance_sampling_ratio/min": 0.0009408618789166212, + "sampling/sampling_logp_difference/max": 6.968714237213135, + 
"sampling/sampling_logp_difference/mean": 0.019255205988883972, + "step": 591 + }, + { + "clip_ratio/high_max": 7.459808557541692e-06, + "clip_ratio/high_mean": 1.864952139385423e-06, + "clip_ratio/low_mean": 3.9836502310208743e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.170145416537707e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 7819.96875, + "completions/mean_terminated_length": 7752.53564453125, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 1.1218742430210114, + "epoch": 0.5446182152713891, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00411194609478116, + "learning_rate": 1e-05, + "loss": 0.0267, + "num_tokens": 521703303.0, + "reward": 0.2265625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999041557312012, + "sampling/importance_sampling_ratio/min": 0.0003571478300727904, + "sampling/sampling_logp_difference/max": 7.937360763549805, + "sampling/sampling_logp_difference/mean": 0.022727783769369125, + "step": 592 + }, + { + "clip_ratio/high_max": 1.8858649582398357e-05, + "clip_ratio/high_mean": 4.714662395599589e-06, + "clip_ratio/low_mean": 3.738353416338214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.2098196558981726e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16117.0, + "completions/mean_length": 6322.8671875, + "completions/mean_terminated_length": 6163.1669921875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 0.8323960080742836, + "epoch": 0.5455381784728611, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022753921803086996, + 
"learning_rate": 1e-05, + "loss": 0.0339, + "num_tokens": 522531422.0, + "reward": 0.4140625, + "reward_std": 0.20753081142902374, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998952150344849, + "sampling/importance_sampling_ratio/min": 5.422274170996388e-06, + "sampling/sampling_logp_difference/max": 12.124995231628418, + "sampling/sampling_logp_difference/mean": 0.01893780007958412, + "step": 593 + }, + { + "clip_ratio/high_max": 3.977598225901602e-06, + "clip_ratio/high_mean": 9.943995564754005e-07, + "clip_ratio/low_mean": 1.1187657776190463e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.2182057332665863e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16055.0, + "completions/mean_length": 7054.0625, + "completions/mean_terminated_length": 6905.96875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.866028867661953, + "epoch": 0.546458141674333, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.004338000901043415, + "learning_rate": 1e-05, + "loss": -0.0134, + "num_tokens": 523453262.0, + "reward": 0.328125, + "reward_std": 0.13204573094844818, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998721480369568, + "sampling/importance_sampling_ratio/min": 7.97068714746274e-05, + "sampling/sampling_logp_difference/max": 9.437154769897461, + "sampling/sampling_logp_difference/mean": 0.01982954889535904, + "step": 594 + }, + { + "clip_ratio/high_max": 1.5038514220577781e-05, + "clip_ratio/high_mean": 3.7596285551444453e-06, + "clip_ratio/low_mean": 3.533169467573316e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.9091323742468376e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 7539.0703125, + "completions/mean_terminated_length": 7027.3798828125, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.8601142391562462, + "epoch": 0.547378104875805, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003401415189728141, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 524436831.0, + "reward": 0.4140625, + "reward_std": 0.2511882185935974, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999969482421875, + "sampling/importance_sampling_ratio/min": 2.0915547793265432e-05, + "sampling/sampling_logp_difference/max": 10.775017738342285, + "sampling/sampling_logp_difference/mean": 0.019884679466485977, + "step": 595 + }, + { + "clip_ratio/high_max": 2.9679867111553904e-05, + "clip_ratio/high_mean": 8.187421713046206e-06, + "clip_ratio/low_mean": 5.44505830930575e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.263800514716422e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16343.0, + "completions/mean_length": 7137.96875, + "completions/mean_terminated_length": 6762.11376953125, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.7909424379467964, + "epoch": 0.5482980680772769, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002879115054383874, + "learning_rate": 1e-05, + "loss": 0.0549, + "num_tokens": 525368091.0, + "reward": 0.546875, + "reward_std": 0.27062684297561646, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000025033950806, + "sampling/importance_sampling_ratio/min": 0.0004618439415935427, + "sampling/sampling_logp_difference/max": 7.680283546447754, + "sampling/sampling_logp_difference/mean": 0.01847894862294197, + "step": 596 + }, + { + "clip_ratio/high_max": 5.765416517533595e-06, + "clip_ratio/high_mean": 1.4413541293833987e-06, + "clip_ratio/low_mean": 3.1269102407804894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2710456423501455e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5486.3671875, + "completions/mean_terminated_length": 5224.82421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9588652476668358, + "epoch": 0.5492180312787488, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004545152187347412, + "learning_rate": 1e-05, + "loss": 0.0549, + "num_tokens": 526095378.0, + "reward": 0.359375, + "reward_std": 0.33508801460266113, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998891353607178, + "sampling/importance_sampling_ratio/min": 6.280510569922626e-05, + "sampling/sampling_logp_difference/max": 9.675474166870117, + "sampling/sampling_logp_difference/mean": 0.02017204463481903, + "step": 597 + }, + { + "clip_ratio/high_max": 1.519483475931338e-05, + "clip_ratio/high_mean": 4.732241109195456e-06, + "clip_ratio/low_mean": 4.477498589494644e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.950722734520241e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16169.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 6636.0078125, + "completions/mean_terminated_length": 6636.0078125, + "completions/min_length": 685.0, + 
"completions/min_terminated_length": 685.0, + "entropy": 0.9497648254036903, + "epoch": 0.5501379944802208, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004040954168885946, + "learning_rate": 1e-05, + "loss": 0.0477, + "num_tokens": 526969459.0, + "reward": 0.3515625, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474287033081, + "sampling/importance_sampling_ratio/min": 2.2340275407373156e-08, + "sampling/sampling_logp_difference/max": 17.61687469482422, + "sampling/sampling_logp_difference/mean": 0.02086419239640236, + "step": 598 + }, + { + "clip_ratio/high_max": 1.5785165032866644e-05, + "clip_ratio/high_mean": 3.946291258216661e-06, + "clip_ratio/low_mean": 4.7215530003086315e-05, + "clip_ratio/low_min": 5.274039267533226e-06, + "clip_ratio/region_mean": 5.116182205711084e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15820.0, + "completions/mean_length": 6462.953125, + "completions/mean_terminated_length": 6142.9189453125, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "entropy": 0.9401230812072754, + "epoch": 0.5510579576816927, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004678349941968918, + "learning_rate": 1e-05, + "loss": 0.1854, + "num_tokens": 527822197.0, + "reward": 0.5234375, + "reward_std": 0.3345640003681183, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997877478599548, + "sampling/importance_sampling_ratio/min": 2.8560234568431042e-05, + "sampling/sampling_logp_difference/max": 10.463495254516602, + "sampling/sampling_logp_difference/mean": 0.019832316786050797, + "step": 599 + }, + { + 
"clip_ratio/high_max": 4.1415414671064354e-06, + "clip_ratio/high_mean": 1.0353853667766089e-06, + "clip_ratio/low_mean": 4.795687004843785e-05, + "clip_ratio/low_min": 7.76807610236574e-06, + "clip_ratio/region_mean": 4.899225518784078e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15170.0, + "completions/mean_length": 7172.1015625, + "completions/mean_terminated_length": 6951.01611328125, + "completions/min_length": 1079.0, + "completions/min_terminated_length": 1079.0, + "entropy": 0.7962061613798141, + "epoch": 0.5519779208831647, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014094997895881534, + "learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 528759458.0, + "reward": 0.3515625, + "reward_std": 0.16834919154644012, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999281167984009, + "sampling/importance_sampling_ratio/min": 0.001331693259999156, + "sampling/sampling_logp_difference/max": 6.621304035186768, + "sampling/sampling_logp_difference/mean": 0.018519852310419083, + "step": 600 + }, + { + "clip_ratio/high_max": 7.3846517807396594e-06, + "clip_ratio/high_mean": 3.018199095095042e-06, + "clip_ratio/low_mean": 5.2064756346226204e-05, + "clip_ratio/low_min": 5.341652013157727e-06, + "clip_ratio/region_mean": 5.5082955441321246e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16195.0, + "completions/mean_length": 6612.6484375, + "completions/mean_terminated_length": 6378.13623046875, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.8218385726213455, + "epoch": 0.5528978840846366, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038943374529480934, + "learning_rate": 1e-05, + "loss": 0.0561, + "num_tokens": 
529626893.0, + "reward": 0.390625, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 0.0024450027849525213, + "sampling/sampling_logp_difference/max": 6.01370906829834, + "sampling/sampling_logp_difference/mean": 0.018441151827573776, + "step": 601 + }, + { + "clip_ratio/high_max": 8.209965471905889e-06, + "clip_ratio/high_mean": 2.0524913679764722e-06, + "clip_ratio/low_mean": 4.8717710285473004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.077020244925734e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15898.0, + "completions/mean_length": 6574.9140625, + "completions/mean_terminated_length": 6419.21484375, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.9268836230039597, + "epoch": 0.5538178472861086, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027088895440101624, + "learning_rate": 1e-05, + "loss": 0.0577, + "num_tokens": 530486578.0, + "reward": 0.4453125, + "reward_std": 0.26143792271614075, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000026822090149, + "sampling/importance_sampling_ratio/min": 1.1735714906535577e-05, + "sampling/sampling_logp_difference/max": 11.352873802185059, + "sampling/sampling_logp_difference/mean": 0.020115964114665985, + "step": 602 + }, + { + "clip_ratio/high_max": 5.24967435922008e-06, + "clip_ratio/high_mean": 1.31241858980502e-06, + "clip_ratio/low_mean": 1.3909025255998131e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5221443845803151e-05, + "completions/clipped_ratio": 0.0078125, 
+ "completions/max_length": 16384.0, + "completions/max_terminated_length": 14361.0, + "completions/mean_length": 6209.1953125, + "completions/mean_terminated_length": 6129.07861328125, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9574517607688904, + "epoch": 0.5547378104875805, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.002628365531563759, + "learning_rate": 1e-05, + "loss": 0.0461, + "num_tokens": 531303083.0, + "reward": 0.3671875, + "reward_std": 0.13098490238189697, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998608827590942, + "sampling/importance_sampling_ratio/min": 2.862734254449606e-05, + "sampling/sampling_logp_difference/max": 10.461148262023926, + "sampling/sampling_logp_difference/mean": 0.019658785313367844, + "step": 603 + }, + { + "clip_ratio/high_max": 1.9014597455679905e-05, + "clip_ratio/high_mean": 4.753649363919976e-06, + "clip_ratio/low_mean": 4.9158792762682424e-05, + "clip_ratio/low_min": 4.514427928370424e-06, + "clip_ratio/region_mean": 5.39124412171077e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13873.0, + "completions/mean_length": 7079.1875, + "completions/mean_terminated_length": 6855.87255859375, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 0.853938102722168, + "epoch": 0.5556577736890524, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004664157051593065, + "learning_rate": 1e-05, + "loss": 0.0285, + "num_tokens": 532228227.0, + "reward": 0.2734375, + "reward_std": 0.30327796936035156, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999879598617554, + 
"sampling/importance_sampling_ratio/min": 5.377535785555665e-07, + "sampling/sampling_logp_difference/max": 14.43586540222168, + "sampling/sampling_logp_difference/mean": 0.018260695040225983, + "step": 604 + }, + { + "clip_ratio/high_max": 3.025483556484687e-05, + "clip_ratio/high_mean": 7.563708891211718e-06, + "clip_ratio/low_mean": 2.1738228269896354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9301936820047558e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15094.0, + "completions/max_terminated_length": 15094.0, + "completions/mean_length": 6071.5390625, + "completions/mean_terminated_length": 6071.5390625, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "entropy": 0.980722151696682, + "epoch": 0.5565777368905244, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004579839296638966, + "learning_rate": 1e-05, + "loss": 0.0168, + "num_tokens": 533024264.0, + "reward": 0.4765625, + "reward_std": 0.30327799916267395, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999982476234436, + "sampling/importance_sampling_ratio/min": 0.0003390153287909925, + "sampling/sampling_logp_difference/max": 7.989465236663818, + "sampling/sampling_logp_difference/mean": 0.01974770799279213, + "step": 605 + }, + { + "clip_ratio/high_max": 1.3344870239961892e-05, + "clip_ratio/high_mean": 4.773990667672479e-06, + "clip_ratio/low_mean": 5.142044130934664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.6194432318079635e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 7352.484375, + "completions/mean_terminated_length": 7209.12744140625, + "completions/min_length": 1310.0, + "completions/min_terminated_length": 1310.0, + "entropy": 
0.7858814746141434, + "epoch": 0.5574977000919963, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002537919208407402, + "learning_rate": 1e-05, + "loss": 0.0576, + "num_tokens": 533985318.0, + "reward": 0.3125, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999037981033325, + "sampling/importance_sampling_ratio/min": 0.0017827138071879745, + "sampling/sampling_logp_difference/max": 6.329618453979492, + "sampling/sampling_logp_difference/mean": 0.018647275865077972, + "step": 606 + }, + { + "clip_ratio/high_max": 2.345925531699322e-05, + "clip_ratio/high_mean": 7.0977013137962786e-06, + "clip_ratio/low_mean": 4.466222731025482e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.175992941985896e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16082.0, + "completions/mean_length": 7095.1875, + "completions/mean_terminated_length": 6947.74658203125, + "completions/min_length": 1073.0, + "completions/min_terminated_length": 1073.0, + "entropy": 0.6846291124820709, + "epoch": 0.5584176632934683, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037982286885380745, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 534912558.0, + "reward": 0.53125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147057533264, + "sampling/importance_sampling_ratio/min": 8.089523180387914e-05, + "sampling/sampling_logp_difference/max": 9.422355651855469, + "sampling/sampling_logp_difference/mean": 0.01693977229297161, + "step": 607 + }, + { + "clip_ratio/high_max": 5.167851668375079e-06, + "clip_ratio/high_mean": 
1.2919629170937696e-06, + "clip_ratio/low_mean": 6.557838094067847e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.687034363039857e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6038.1953125, + "completions/mean_terminated_length": 5873.9765625, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "entropy": 0.8637901693582535, + "epoch": 0.5593376264949402, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030545955523848534, + "learning_rate": 1e-05, + "loss": 0.0716, + "num_tokens": 535707127.0, + "reward": 0.5078125, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999387264251709, + "sampling/importance_sampling_ratio/min": 0.00017956242663785815, + "sampling/sampling_logp_difference/max": 8.624987602233887, + "sampling/sampling_logp_difference/mean": 0.018705151975154877, + "step": 608 + }, + { + "clip_ratio/high_max": 1.7691760149318725e-05, + "clip_ratio/high_mean": 5.544901910070621e-06, + "clip_ratio/low_mean": 5.012885230826214e-05, + "clip_ratio/low_min": 3.5653165468829684e-06, + "clip_ratio/region_mean": 5.5673754559393274e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14906.0, + "completions/mean_length": 6978.0078125, + "completions/mean_terminated_length": 6828.70654296875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.7931060045957565, + "epoch": 0.5602575896964122, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002951717935502529, + "learning_rate": 1e-05, + "loss": 0.0698, + "num_tokens": 536618376.0, + "reward": 0.46875, + "reward_std": 0.3527044355869293, + 
"rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 3.865327380481176e-05, + "sampling/sampling_logp_difference/max": 10.160879135131836, + "sampling/sampling_logp_difference/mean": 0.018486514687538147, + "step": 609 + }, + { + "clip_ratio/high_max": 2.1591150925814873e-05, + "clip_ratio/high_mean": 5.397787731453718e-06, + "clip_ratio/low_mean": 6.101864732954709e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.6416435629435e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15329.0, + "completions/max_terminated_length": 15329.0, + "completions/mean_length": 6810.15625, + "completions/mean_terminated_length": 6810.15625, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.8957240954041481, + "epoch": 0.5611775528978841, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019385438645258546, + "learning_rate": 1e-05, + "loss": 0.0973, + "num_tokens": 537513876.0, + "reward": 0.328125, + "reward_std": 0.28011518716812134, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000025749206543, + "sampling/importance_sampling_ratio/min": 4.845474904868752e-05, + "sampling/sampling_logp_difference/max": 9.934880256652832, + "sampling/sampling_logp_difference/mean": 0.02021351456642151, + "step": 610 + }, + { + "clip_ratio/high_max": 1.4817902865615906e-05, + "clip_ratio/high_mean": 5.914362077419355e-06, + "clip_ratio/low_mean": 1.2616926369446446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8531288333178964e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16065.0, + 
"completions/mean_length": 6940.4140625, + "completions/mean_terminated_length": 6713.7685546875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.8646975234150887, + "epoch": 0.562097516099356, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001886329147964716, + "learning_rate": 1e-05, + "loss": 0.0319, + "num_tokens": 538419265.0, + "reward": 0.375, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000052452087402, + "sampling/importance_sampling_ratio/min": 6.893687327647058e-07, + "sampling/sampling_logp_difference/max": 14.18748950958252, + "sampling/sampling_logp_difference/mean": 0.019072774797677994, + "step": 611 + }, + { + "clip_ratio/high_max": 6.3681300161988474e-06, + "clip_ratio/high_mean": 1.5920325040497119e-06, + "clip_ratio/low_mean": 3.254086982451554e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4132902555938927e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15960.0, + "completions/mean_length": 7508.796875, + "completions/mean_terminated_length": 6995.35498046875, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.7723299860954285, + "epoch": 0.563017479300828, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002031022449955344, + "learning_rate": 1e-05, + "loss": 0.0335, + "num_tokens": 539399127.0, + "reward": 0.4296875, + "reward_std": 0.2301519513130188, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.0056421491317451, + "sampling/sampling_logp_difference/max": 
5.177490234375, + "sampling/sampling_logp_difference/mean": 0.01832709088921547, + "step": 612 + }, + { + "clip_ratio/high_max": 1.5848977909627138e-05, + "clip_ratio/high_mean": 3.9622444774067844e-06, + "clip_ratio/low_mean": 2.6742804038804024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.070504851621081e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15816.0, + "completions/mean_length": 6019.6484375, + "completions/mean_terminated_length": 5938.03955078125, + "completions/min_length": 1020.0, + "completions/min_terminated_length": 1020.0, + "entropy": 0.7425512671470642, + "epoch": 0.5639374425022999, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003653773572295904, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 540189602.0, + "reward": 0.53125, + "reward_std": 0.26143303513526917, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999122619628906, + "sampling/importance_sampling_ratio/min": 0.005288486368954182, + "sampling/sampling_logp_difference/max": 5.242223262786865, + "sampling/sampling_logp_difference/mean": 0.017161473631858826, + "step": 613 + }, + { + "clip_ratio/high_max": 1.1017190900020069e-05, + "clip_ratio/high_mean": 2.754297725005017e-06, + "clip_ratio/low_mean": 3.428678644468164e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7041084169686656e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15861.0, + "completions/mean_length": 7155.6953125, + "completions/mean_terminated_length": 6621.826171875, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "entropy": 0.9789249897003174, + "epoch": 0.5648574057037719, + "frac_reward_zero_std": 0.4375, + "grad_norm": 
0.003739065257832408, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 541125587.0, + "reward": 0.265625, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999271631240845, + "sampling/importance_sampling_ratio/min": 9.236609002982732e-06, + "sampling/sampling_logp_difference/max": 11.59233570098877, + "sampling/sampling_logp_difference/mean": 0.02008877694606781, + "step": 614 + }, + { + "clip_ratio/high_max": 5.6091539590852335e-06, + "clip_ratio/high_mean": 2.4549021873099264e-06, + "clip_ratio/low_mean": 4.249646542575647e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4951367613066395e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13553.0, + "completions/mean_length": 8027.359375, + "completions/mean_terminated_length": 7470.25048828125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.9153474718332291, + "epoch": 0.5657773689052438, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0020656392443925142, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 542173801.0, + "reward": 0.2578125, + "reward_std": 0.22225633263587952, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999947190284729, + "sampling/importance_sampling_ratio/min": 0.00029620854184031487, + "sampling/sampling_logp_difference/max": 8.124446868896484, + "sampling/sampling_logp_difference/mean": 0.021495234221220016, + "step": 615 + }, + { + "clip_ratio/high_max": 1.7302586002188036e-05, + "clip_ratio/high_mean": 4.325646500547009e-06, + "clip_ratio/low_mean": 5.2193488272678223e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 5.6519134659538395e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 6115.3828125, + "completions/mean_terminated_length": 5952.38916015625, + "completions/min_length": 1158.0, + "completions/min_terminated_length": 1158.0, + "entropy": 0.751783661544323, + "epoch": 0.5666973321067157, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00824788399040699, + "learning_rate": 1e-05, + "loss": 0.0648, + "num_tokens": 542977266.0, + "reward": 0.4609375, + "reward_std": 0.30616888403892517, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999478459358215, + "sampling/importance_sampling_ratio/min": 0.0013296925462782383, + "sampling/sampling_logp_difference/max": 6.622807502746582, + "sampling/sampling_logp_difference/mean": 0.017732972279191017, + "step": 616 + }, + { + "clip_ratio/high_max": 2.872588265745435e-05, + "clip_ratio/high_mean": 8.185486876755022e-06, + "clip_ratio/low_mean": 5.301810256241879e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.120358921180014e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15688.0, + "completions/mean_length": 7431.3203125, + "completions/mean_terminated_length": 7142.52392578125, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "entropy": 0.9122852608561516, + "epoch": 0.5676172953081877, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005189655348658562, + "learning_rate": 1e-05, + "loss": 0.0613, + "num_tokens": 543947515.0, + "reward": 0.484375, + "reward_std": 0.21595832705497742, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, 
+ "sampling/importance_sampling_ratio/mean": 0.9999845623970032, + "sampling/importance_sampling_ratio/min": 0.00017607140762265772, + "sampling/sampling_logp_difference/max": 8.644620895385742, + "sampling/sampling_logp_difference/mean": 0.02111673541367054, + "step": 617 + }, + { + "clip_ratio/high_max": 3.984698651038343e-06, + "clip_ratio/high_mean": 9.961746627595858e-07, + "clip_ratio/low_mean": 3.414959587644262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.514577088026272e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16378.0, + "completions/mean_length": 5700.5546875, + "completions/mean_terminated_length": 5530.9765625, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.8961661159992218, + "epoch": 0.5685372585096596, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004707770887762308, + "learning_rate": 1e-05, + "loss": 0.0773, + "num_tokens": 544694826.0, + "reward": 0.4921875, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998490214347839, + "sampling/importance_sampling_ratio/min": 5.211461817644647e-10, + "sampling/sampling_logp_difference/max": 21.374990463256836, + "sampling/sampling_logp_difference/mean": 0.018697837367653847, + "step": 618 + }, + { + "clip_ratio/high_max": 1.1809721399913542e-05, + "clip_ratio/high_mean": 2.9524303499783855e-06, + "clip_ratio/low_mean": 5.229935004535946e-05, + "clip_ratio/low_min": 4.098226327187149e-06, + "clip_ratio/region_mean": 5.525178062271152e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12422.0, + "completions/max_terminated_length": 12422.0, + "completions/mean_length": 4201.6796875, + "completions/mean_terminated_length": 4201.6796875, + "completions/min_length": 
436.0, + "completions/min_terminated_length": 436.0, + "entropy": 0.7066933363676071, + "epoch": 0.5694572217111316, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.00980924628674984, + "learning_rate": 1e-05, + "loss": 0.0492, + "num_tokens": 545255377.0, + "reward": 0.5625, + "reward_std": 0.38664889335632324, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000074028968811, + "sampling/importance_sampling_ratio/min": 7.827866647858173e-05, + "sampling/sampling_logp_difference/max": 9.455235481262207, + "sampling/sampling_logp_difference/mean": 0.016301468014717102, + "step": 619 + }, + { + "clip_ratio/high_max": 6.093102456361521e-06, + "clip_ratio/high_mean": 1.5232756140903803e-06, + "clip_ratio/low_mean": 1.853809601470857e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0061371856172627e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13234.0, + "completions/mean_length": 5782.2578125, + "completions/mean_terminated_length": 5613.9765625, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.846621498465538, + "epoch": 0.5703771849126035, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005619424395263195, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 546013882.0, + "reward": 0.46875, + "reward_std": 0.2472364753484726, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000319480895996, + "sampling/importance_sampling_ratio/min": 9.447568299947307e-05, + "sampling/sampling_logp_difference/max": 9.267168045043945, + "sampling/sampling_logp_difference/mean": 0.018704919144511223, + "step": 620 + }, + { + "clip_ratio/high_max": 
1.6747734207456233e-05, + "clip_ratio/high_mean": 4.186933551864058e-06, + "clip_ratio/low_mean": 4.008232758678787e-05, + "clip_ratio/low_min": 3.511630438879365e-06, + "clip_ratio/region_mean": 4.426926193445979e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 7191.4921875, + "completions/mean_terminated_length": 7045.57958984375, + "completions/min_length": 1379.0, + "completions/min_terminated_length": 1379.0, + "entropy": 0.7846563309431076, + "epoch": 0.5712971481140754, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0063271005637943745, + "learning_rate": 1e-05, + "loss": 0.0964, + "num_tokens": 546954857.0, + "reward": 0.4296875, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999164342880249, + "sampling/importance_sampling_ratio/min": 0.006330032367259264, + "sampling/sampling_logp_difference/max": 5.062449932098389, + "sampling/sampling_logp_difference/mean": 0.01846012845635414, + "step": 621 + }, + { + "clip_ratio/high_max": 3.451678094279487e-05, + "clip_ratio/high_mean": 1.2486661603361426e-05, + "clip_ratio/low_mean": 5.253966105556174e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.502632390947838e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15529.0, + "completions/max_terminated_length": 15529.0, + "completions/mean_length": 5491.7421875, + "completions/mean_terminated_length": 5491.7421875, + "completions/min_length": 1644.0, + "completions/min_terminated_length": 1644.0, + "entropy": 0.6960643380880356, + "epoch": 0.5722171113155474, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005836677737534046, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 547676024.0, + "reward": 0.5625, + "reward_std": 
0.43213340640068054, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999930739402771, + "sampling/importance_sampling_ratio/min": 0.00043176248436793685, + "sampling/sampling_logp_difference/max": 7.7476348876953125, + "sampling/sampling_logp_difference/mean": 0.016565188765525818, + "step": 622 + }, + { + "clip_ratio/high_max": 4.318982973927632e-06, + "clip_ratio/high_mean": 1.079745743481908e-06, + "clip_ratio/low_mean": 3.0399249226320535e-05, + "clip_ratio/low_min": 5.838393462909153e-06, + "clip_ratio/region_mean": 3.147899496980244e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16179.0, + "completions/mean_length": 6993.125, + "completions/mean_terminated_length": 6844.06396484375, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "entropy": 0.8031502217054367, + "epoch": 0.5731370745170193, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00226933928206563, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 548590080.0, + "reward": 0.3984375, + "reward_std": 0.19332444667816162, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 1.1417677114877733e-06, + "sampling/sampling_logp_difference/max": 13.68293285369873, + "sampling/sampling_logp_difference/mean": 0.01880657486617565, + "step": 623 + }, + { + "clip_ratio/high_max": 8.404208529100288e-06, + "clip_ratio/high_mean": 2.101052132275072e-06, + "clip_ratio/low_mean": 4.231840989632474e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.441946202859981e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15278.0, + 
"completions/max_terminated_length": 15278.0, + "completions/mean_length": 5602.8359375, + "completions/mean_terminated_length": 5602.8359375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.8287182524800301, + "epoch": 0.5740570377184913, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.005067484453320503, + "learning_rate": 1e-05, + "loss": 0.0394, + "num_tokens": 549327251.0, + "reward": 0.5, + "reward_std": 0.35218530893325806, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701380729675, + "sampling/importance_sampling_ratio/min": 0.0036069792695343494, + "sampling/sampling_logp_difference/max": 5.624884605407715, + "sampling/sampling_logp_difference/mean": 0.018545404076576233, + "step": 624 + }, + { + "clip_ratio/high_max": 7.49742275729659e-06, + "clip_ratio/high_mean": 1.8743556893241475e-06, + "clip_ratio/low_mean": 4.6288066641864134e-05, + "clip_ratio/low_min": 5.32640206074575e-06, + "clip_ratio/region_mean": 4.816242244487512e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15901.0, + "completions/mean_length": 6747.0234375, + "completions/mean_terminated_length": 6671.1416015625, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "entropy": 0.8722762316465378, + "epoch": 0.5749770009199632, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023132911883294582, + "learning_rate": 1e-05, + "loss": 0.0064, + "num_tokens": 550208750.0, + "reward": 0.390625, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999475479125977, + "sampling/importance_sampling_ratio/min": 
0.003727440955117345, + "sampling/sampling_logp_difference/max": 5.592033386230469, + "sampling/sampling_logp_difference/mean": 0.019216621294617653, + "step": 625 + }, + { + "clip_ratio/high_max": 7.693567567912396e-06, + "clip_ratio/high_mean": 1.923391891978099e-06, + "clip_ratio/low_mean": 6.517495285152108e-05, + "clip_ratio/low_min": 1.1217302017030306e-05, + "clip_ratio/region_mean": 6.709834497087286e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16027.0, + "completions/max_terminated_length": 16027.0, + "completions/mean_length": 6983.40625, + "completions/mean_terminated_length": 6983.40625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.8781512826681137, + "epoch": 0.5758969641214351, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0036700034979730844, + "learning_rate": 1e-05, + "loss": 0.0905, + "num_tokens": 551123002.0, + "reward": 0.328125, + "reward_std": 0.2419992983341217, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999868273735046, + "sampling/importance_sampling_ratio/min": 5.0360464229015633e-05, + "sampling/sampling_logp_difference/max": 9.8963041305542, + "sampling/sampling_logp_difference/mean": 0.019318291917443275, + "step": 626 + }, + { + "clip_ratio/high_max": 5.098295332572889e-06, + "clip_ratio/high_mean": 1.2745738331432221e-06, + "clip_ratio/low_mean": 5.9073974398415885e-05, + "clip_ratio/low_min": 6.781316187698394e-06, + "clip_ratio/region_mean": 6.034854845893278e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16201.0, + "completions/mean_length": 7143.671875, + "completions/mean_terminated_length": 6689.22900390625, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.7715872526168823, + 
"epoch": 0.5768169273229071, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0036717690527439117, + "learning_rate": 1e-05, + "loss": 0.0268, + "num_tokens": 552055472.0, + "reward": 0.3671875, + "reward_std": 0.2212003767490387, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998798966407776, + "sampling/importance_sampling_ratio/min": 0.00012340980174485594, + "sampling/sampling_logp_difference/max": 9.0, + "sampling/sampling_logp_difference/mean": 0.018518533557653427, + "step": 627 + }, + { + "clip_ratio/high_max": 1.778747127900715e-05, + "clip_ratio/high_mean": 4.4468678197517875e-06, + "clip_ratio/low_mean": 2.460010267668622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9046970439594588e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15729.0, + "completions/mean_length": 6558.5859375, + "completions/mean_terminated_length": 6075.36865234375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 1061.0, + "entropy": 0.9016438648104668, + "epoch": 0.577736890524379, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019187588477507234, + "learning_rate": 1e-05, + "loss": 0.0494, + "num_tokens": 552914275.0, + "reward": 0.484375, + "reward_std": 0.2041158676147461, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999418258666992, + "sampling/importance_sampling_ratio/min": 0.00011496193474158645, + "sampling/sampling_logp_difference/max": 9.07090950012207, + "sampling/sampling_logp_difference/mean": 0.01948089525103569, + "step": 628 + }, + { + "clip_ratio/high_max": 1.383282506139949e-05, + "clip_ratio/high_mean": 3.4582062653498724e-06, + "clip_ratio/low_mean": 
4.3287541757308645e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.674574802265852e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15812.0, + "completions/max_terminated_length": 15812.0, + "completions/mean_length": 6150.2734375, + "completions/mean_terminated_length": 6150.2734375, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.8385711833834648, + "epoch": 0.578656853725851, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003598993644118309, + "learning_rate": 1e-05, + "loss": 0.0907, + "num_tokens": 553719958.0, + "reward": 0.5078125, + "reward_std": 0.3022220730781555, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999948740005493, + "sampling/importance_sampling_ratio/min": 0.000830297009088099, + "sampling/sampling_logp_difference/max": 7.093727111816406, + "sampling/sampling_logp_difference/mean": 0.019557828083634377, + "step": 629 + }, + { + "clip_ratio/high_max": 2.668830120455823e-06, + "clip_ratio/high_mean": 6.672075301139557e-07, + "clip_ratio/low_mean": 1.7461135655594262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8128343185708218e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 8142.46875, + "completions/mean_terminated_length": 7519.16015625, + "completions/min_length": 1828.0, + "completions/min_terminated_length": 1828.0, + "entropy": 0.8508284538984299, + "epoch": 0.5795768169273229, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.002453390508890152, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 554784458.0, + "reward": 0.390625, + "reward_std": 0.1422954648733139, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999715089797974, + "sampling/importance_sampling_ratio/min": 0.0002036939695244655, + "sampling/sampling_logp_difference/max": 8.498891830444336, + "sampling/sampling_logp_difference/mean": 0.019445519894361496, + "step": 630 + }, + { + "clip_ratio/high_max": 1.9002460248884745e-05, + "clip_ratio/high_mean": 4.750615062221186e-06, + "clip_ratio/low_mean": 3.1556500402984966e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630711614732718e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16109.0, + "completions/mean_length": 7665.921875, + "completions/mean_terminated_length": 7384.693359375, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "entropy": 0.7667205557227135, + "epoch": 0.5804967801287948, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027936683036386967, + "learning_rate": 1e-05, + "loss": 0.0245, + "num_tokens": 555783296.0, + "reward": 0.4296875, + "reward_std": 0.24435830116271973, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998488426208496, + "sampling/importance_sampling_ratio/min": 0.0002781523216981441, + "sampling/sampling_logp_difference/max": 8.187341690063477, + "sampling/sampling_logp_difference/mean": 0.01912892609834671, + "step": 631 + }, + { + "clip_ratio/high_max": 1.5569996094200178e-05, + "clip_ratio/high_mean": 3.8924990235500445e-06, + "clip_ratio/low_mean": 3.8605214058407e-05, + "clip_ratio/low_min": 6.2870940382708795e-06, + "clip_ratio/region_mean": 4.249771222930576e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16379.0, + "completions/mean_length": 7266.171875, + 
"completions/mean_terminated_length": 6972.04833984375, + "completions/min_length": 1117.0, + "completions/min_terminated_length": 1117.0, + "entropy": 0.7114122956991196, + "epoch": 0.5814167433302668, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004213637672364712, + "learning_rate": 1e-05, + "loss": 0.1028, + "num_tokens": 556732942.0, + "reward": 0.5390625, + "reward_std": 0.3135277032852173, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999159574508667, + "sampling/importance_sampling_ratio/min": 1.760348027346481e-06, + "sampling/sampling_logp_difference/max": 13.249999046325684, + "sampling/sampling_logp_difference/mean": 0.01689826510846615, + "step": 632 + }, + { + "clip_ratio/high_max": 2.1737864472015644e-05, + "clip_ratio/high_mean": 5.434466118003911e-06, + "clip_ratio/low_mean": 3.640393322257296e-05, + "clip_ratio/low_min": 3.0146634344419e-06, + "clip_ratio/region_mean": 4.183839985216764e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6532.9921875, + "completions/mean_terminated_length": 6296.568359375, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.7711968123912811, + "epoch": 0.5823367065317387, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004169877618551254, + "learning_rate": 1e-05, + "loss": 0.0406, + "num_tokens": 557589141.0, + "reward": 0.546875, + "reward_std": 0.2675113081932068, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999022483825684, + "sampling/importance_sampling_ratio/min": 4.499705482885474e-06, + "sampling/sampling_logp_difference/max": 12.311498641967773, + 
"sampling/sampling_logp_difference/mean": 0.018738210201263428, + "step": 633 + }, + { + "clip_ratio/high_max": 6.099523716329713e-06, + "clip_ratio/high_mean": 1.5248809290824283e-06, + "clip_ratio/low_mean": 6.070675681257853e-05, + "clip_ratio/low_min": 5.175126261747209e-06, + "clip_ratio/region_mean": 6.223163745744387e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16337.0, + "completions/mean_length": 7384.3203125, + "completions/mean_terminated_length": 7168.328125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.8054972141981125, + "epoch": 0.5832566697332107, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032470994628965855, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 558557286.0, + "reward": 0.4140625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999680519104004, + "sampling/importance_sampling_ratio/min": 0.00019634375348687172, + "sampling/sampling_logp_difference/max": 8.535643577575684, + "sampling/sampling_logp_difference/mean": 0.019018521532416344, + "step": 634 + }, + { + "clip_ratio/high_max": 4.436853964762122e-05, + "clip_ratio/high_mean": 1.1092134911905305e-05, + "clip_ratio/low_mean": 3.798940008437057e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.908153437099827e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15918.0, + "completions/mean_length": 6131.9453125, + "completions/mean_terminated_length": 6051.22021484375, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "entropy": 0.8365718051791191, + "epoch": 0.5841766329346826, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.004848263692110777, + "learning_rate": 1e-05, + "loss": 0.1247, + "num_tokens": 559364639.0, + "reward": 0.5625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000056266784668, + "sampling/importance_sampling_ratio/min": 5.424115443020128e-06, + "sampling/sampling_logp_difference/max": 12.124655723571777, + "sampling/sampling_logp_difference/mean": 0.018360167741775513, + "step": 635 + }, + { + "clip_ratio/high_max": 1.9398633412492927e-05, + "clip_ratio/high_mean": 4.849658353123232e-06, + "clip_ratio/low_mean": 2.7543567512111622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.239322609260853e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15724.0, + "completions/max_terminated_length": 15724.0, + "completions/mean_length": 5746.8828125, + "completions/mean_terminated_length": 5746.8828125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.6247628927230835, + "epoch": 0.5850965961361545, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003403177484869957, + "learning_rate": 1e-05, + "loss": 0.0279, + "num_tokens": 560119248.0, + "reward": 0.5390625, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999486207962036, + "sampling/importance_sampling_ratio/min": 6.475952432083432e-07, + "sampling/sampling_logp_difference/max": 14.25, + "sampling/sampling_logp_difference/mean": 0.015006184577941895, + "step": 636 + }, + { + "clip_ratio/high_max": 2.857848289750109e-05, + "clip_ratio/high_mean": 8.111364707019675e-06, + "clip_ratio/low_mean": 4.927243321617425e-05, + "clip_ratio/low_min": 5.929088274569949e-06, + 
"clip_ratio/region_mean": 5.738379809372418e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7313.7890625, + "completions/mean_terminated_length": 7096.1044921875, + "completions/min_length": 1068.0, + "completions/min_terminated_length": 1068.0, + "entropy": 0.8606570512056351, + "epoch": 0.5860165593376265, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004058506805449724, + "learning_rate": 1e-05, + "loss": 0.093, + "num_tokens": 561072493.0, + "reward": 0.375, + "reward_std": 0.3079911172389984, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0006621598731726408, + "sampling/sampling_logp_difference/max": 7.320003509521484, + "sampling/sampling_logp_difference/mean": 0.01940958946943283, + "step": 637 + }, + { + "clip_ratio/high_max": 2.7213282010052353e-05, + "clip_ratio/high_mean": 7.758043807370996e-06, + "clip_ratio/low_mean": 4.890350828645751e-05, + "clip_ratio/low_min": 3.968002147303196e-06, + "clip_ratio/region_mean": 5.666155129802064e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16093.0, + "completions/mean_length": 7495.5078125, + "completions/mean_terminated_length": 7425.51953125, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "entropy": 0.8225502669811249, + "epoch": 0.5869365225390984, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002768489997833967, + "learning_rate": 1e-05, + "loss": 0.098, + "num_tokens": 562048734.0, + "reward": 0.3671875, + "reward_std": 0.344813734292984, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 1.4612716768169776e-05, + "sampling/sampling_logp_difference/max": 11.133618354797363, + "sampling/sampling_logp_difference/mean": 0.0189508069306612, + "step": 638 + }, + { + "clip_ratio/high_max": 2.5246594077543705e-05, + "clip_ratio/high_mean": 6.311648519385926e-06, + "clip_ratio/low_mean": 4.9131452101391915e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.544310107552519e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15894.0, + "completions/mean_length": 6856.5703125, + "completions/mean_terminated_length": 6627.912109375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.8542520478367805, + "epoch": 0.5878564857405704, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002966079628095031, + "learning_rate": 1e-05, + "loss": 0.0507, + "num_tokens": 562945623.0, + "reward": 0.40625, + "reward_std": 0.3016803562641144, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998261332511902, + "sampling/importance_sampling_ratio/min": 0.0001795661955839023, + "sampling/sampling_logp_difference/max": 8.624966621398926, + "sampling/sampling_logp_difference/mean": 0.019664689898490906, + "step": 639 + }, + { + "clip_ratio/high_max": 1.2127683930884814e-05, + "clip_ratio/high_mean": 5.316983106240514e-06, + "clip_ratio/low_mean": 4.154238490627904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.685936778514588e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15231.0, + "completions/mean_length": 6463.2421875, + "completions/mean_terminated_length": 
6305.77001953125, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.8427078947424889, + "epoch": 0.5887764489420423, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021058651618659496, + "learning_rate": 1e-05, + "loss": 0.0164, + "num_tokens": 563789214.0, + "reward": 0.3046875, + "reward_std": 0.24541424214839935, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998518824577332, + "sampling/importance_sampling_ratio/min": 0.00043074542190879583, + "sampling/sampling_logp_difference/max": 7.749993324279785, + "sampling/sampling_logp_difference/mean": 0.01898353546857834, + "step": 640 + }, + { + "clip_ratio/high_max": 1.2559269862322253e-05, + "clip_ratio/high_mean": 3.1398174655805633e-06, + "clip_ratio/low_mean": 3.146892504446441e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4608742623731814e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15232.0, + "completions/max_terminated_length": 15232.0, + "completions/mean_length": 6140.7734375, + "completions/mean_terminated_length": 6140.7734375, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.8800382614135742, + "epoch": 0.5896964121435143, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005890186410397291, + "learning_rate": 1e-05, + "loss": 0.0816, + "num_tokens": 564596185.0, + "reward": 0.4765625, + "reward_std": 0.23486016690731049, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998830556869507, + "sampling/importance_sampling_ratio/min": 0.000808614946436137, + "sampling/sampling_logp_difference/max": 7.120187759399414, + "sampling/sampling_logp_difference/mean": 0.01930009014904499, + 
"step": 641 + }, + { + "clip_ratio/high_max": 5.099334885017015e-06, + "clip_ratio/high_mean": 1.2748337212542538e-06, + "clip_ratio/low_mean": 4.3151162458343606e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.442599617959786e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6361.703125, + "completions/mean_terminated_length": 6202.61962890625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.8246701806783676, + "epoch": 0.5906163753449862, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003226465079933405, + "learning_rate": 1e-05, + "loss": -0.0094, + "num_tokens": 565430387.0, + "reward": 0.359375, + "reward_std": 0.2682726979255676, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999127984046936, + "sampling/importance_sampling_ratio/min": 0.004490039311349392, + "sampling/sampling_logp_difference/max": 5.405893802642822, + "sampling/sampling_logp_difference/mean": 0.019014433026313782, + "step": 642 + }, + { + "clip_ratio/high_max": 2.8547008014356834e-05, + "clip_ratio/high_mean": 7.822751001640427e-06, + "clip_ratio/low_mean": 3.808748408573592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.591023491684609e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16283.0, + "completions/max_terminated_length": 16283.0, + "completions/mean_length": 7363.5234375, + "completions/mean_terminated_length": 7363.5234375, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.828450471162796, + "epoch": 0.5915363385464582, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003077681176364422, + "learning_rate": 1e-05, + "loss": 0.0597, + "num_tokens": 566393214.0, + "reward": 
0.4453125, + "reward_std": 0.24830512702465057, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 3.297756165920873e-07, + "sampling/sampling_logp_difference/max": 14.924853324890137, + "sampling/sampling_logp_difference/mean": 0.01871068961918354, + "step": 643 + }, + { + "clip_ratio/high_max": 4.856254690821515e-06, + "clip_ratio/high_mean": 1.2140636727053788e-06, + "clip_ratio/low_mean": 1.9775024611590197e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.098908817060874e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16016.0, + "completions/mean_length": 6883.8984375, + "completions/mean_terminated_length": 6809.09423828125, + "completions/min_length": 830.0, + "completions/min_terminated_length": 830.0, + "entropy": 0.9114723727107048, + "epoch": 0.5924563017479301, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023631115909665823, + "learning_rate": 1e-05, + "loss": -0.0326, + "num_tokens": 567294697.0, + "reward": 0.3359375, + "reward_std": 0.22567616403102875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999625086784363, + "sampling/importance_sampling_ratio/min": 0.005032482091337442, + "sampling/sampling_logp_difference/max": 5.291841983795166, + "sampling/sampling_logp_difference/mean": 0.02030845358967781, + "step": 644 + }, + { + "clip_ratio/high_max": 4.608634753822116e-06, + "clip_ratio/high_mean": 1.152158688455529e-06, + "clip_ratio/low_mean": 3.9204465110742603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035662391288497e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16376.0, + "completions/mean_length": 6996.9296875, + "completions/mean_terminated_length": 6923.015625, + "completions/min_length": 1477.0, + "completions/min_terminated_length": 1477.0, + "entropy": 0.7864109799265862, + "epoch": 0.593376264949402, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.006442595738917589, + "learning_rate": 1e-05, + "loss": 0.071, + "num_tokens": 568210240.0, + "reward": 0.390625, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999593496322632, + "sampling/importance_sampling_ratio/min": 0.0011364181991666555, + "sampling/sampling_logp_difference/max": 6.779873847961426, + "sampling/sampling_logp_difference/mean": 0.018702290952205658, + "step": 645 + }, + { + "clip_ratio/high_max": 1.442532902728999e-05, + "clip_ratio/high_mean": 5.011521352571435e-06, + "clip_ratio/low_mean": 5.24772226526693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.748874355049338e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16367.0, + "completions/mean_length": 6384.5546875, + "completions/mean_terminated_length": 6305.81884765625, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "entropy": 0.7353173196315765, + "epoch": 0.594296228150874, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004090449772775173, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 569046727.0, + "reward": 0.546875, + "reward_std": 0.3266732692718506, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999207854270935, + 
"sampling/importance_sampling_ratio/min": 0.00038435845635831356, + "sampling/sampling_logp_difference/max": 7.8639349937438965, + "sampling/sampling_logp_difference/mean": 0.017125204205513, + "step": 646 + }, + { + "clip_ratio/high_max": 1.2007675650238525e-05, + "clip_ratio/high_mean": 3.0019189125596313e-06, + "clip_ratio/low_mean": 3.2856025427463464e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.585794411264942e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16258.0, + "completions/mean_length": 7074.59375, + "completions/mean_terminated_length": 6696.29248046875, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 0.9198992624878883, + "epoch": 0.5952161913523459, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0030447279568761587, + "learning_rate": 1e-05, + "loss": 0.0076, + "num_tokens": 569975323.0, + "reward": 0.359375, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999834299087524, + "sampling/importance_sampling_ratio/min": 0.024500105530023575, + "sampling/sampling_logp_difference/max": 3.709077835083008, + "sampling/sampling_logp_difference/mean": 0.019303584471344948, + "step": 647 + }, + { + "clip_ratio/high_max": 6.353676781145623e-06, + "clip_ratio/high_mean": 1.5884191952864057e-06, + "clip_ratio/low_mean": 7.121561156964162e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.280403042386752e-05, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16211.0, + "completions/mean_length": 8044.2578125, + "completions/mean_terminated_length": 7181.52587890625, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "entropy": 
0.8030193895101547, + "epoch": 0.5961361545538179, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004508152138441801, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 571024900.0, + "reward": 0.3203125, + "reward_std": 0.26698729395866394, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999692440032959, + "sampling/importance_sampling_ratio/min": 3.98563061025925e-05, + "sampling/sampling_logp_difference/max": 10.130229949951172, + "sampling/sampling_logp_difference/mean": 0.018804769963026047, + "step": 648 + }, + { + "clip_ratio/high_max": 6.815517735958565e-06, + "clip_ratio/high_mean": 1.7038794339896413e-06, + "clip_ratio/low_mean": 3.612134810282441e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7825227536814054e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15903.0, + "completions/mean_length": 8451.7578125, + "completions/mean_terminated_length": 7922.94189453125, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 1.008152723312378, + "epoch": 0.5970561177552898, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003926917444914579, + "learning_rate": 1e-05, + "loss": 0.0265, + "num_tokens": 572125141.0, + "reward": 0.203125, + "reward_std": 0.19226360321044922, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999009370803833, + "sampling/importance_sampling_ratio/min": 8.862401301712453e-08, + "sampling/sampling_logp_difference/max": 16.238862991333008, + "sampling/sampling_logp_difference/mean": 0.021555956453084946, + "step": 649 + }, + { + "clip_ratio/high_max": 1.5184358971964684e-05, + "clip_ratio/high_mean": 
3.796089742991171e-06, + "clip_ratio/low_mean": 5.86272076361638e-05, + "clip_ratio/low_min": 1.1987166999460896e-05, + "clip_ratio/region_mean": 6.242329754968523e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16120.0, + "completions/mean_length": 7011.8203125, + "completions/mean_terminated_length": 6786.88818359375, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.8761812150478363, + "epoch": 0.5979760809567617, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036475847009569407, + "learning_rate": 1e-05, + "loss": 0.0367, + "num_tokens": 573041934.0, + "reward": 0.3984375, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999783039093018, + "sampling/importance_sampling_ratio/min": 3.535783980623819e-05, + "sampling/sampling_logp_difference/max": 10.249990463256836, + "sampling/sampling_logp_difference/mean": 0.02046291157603264, + "step": 650 + }, + { + "clip_ratio/high_max": 1.0979118769682827e-05, + "clip_ratio/high_mean": 2.744779692420707e-06, + "clip_ratio/low_mean": 4.855269958170538e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.129747910359583e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15510.0, + "completions/mean_length": 7665.7421875, + "completions/mean_terminated_length": 7161.3798828125, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "entropy": 0.7933268994092941, + "epoch": 0.5988960441582337, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038963130209594965, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 574040917.0, + "reward": 0.453125, + "reward_std": 0.3169426918029785, + 
"rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545812606812, + "sampling/importance_sampling_ratio/min": 1.5536705859631184e-06, + "sampling/sampling_logp_difference/max": 13.374890327453613, + "sampling/sampling_logp_difference/mean": 0.01943662390112877, + "step": 651 + }, + { + "clip_ratio/high_max": 9.610412234906107e-06, + "clip_ratio/high_mean": 3.893257598974742e-06, + "clip_ratio/low_mean": 2.4625115656817798e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8518373483166215e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16049.0, + "completions/mean_length": 7966.828125, + "completions/mean_terminated_length": 7695.30615234375, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 0.8473240435123444, + "epoch": 0.5998160073597056, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0030520735308527946, + "learning_rate": 1e-05, + "loss": 0.0418, + "num_tokens": 575078695.0, + "reward": 0.2734375, + "reward_std": 0.19332443177700043, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000214576721191, + "sampling/importance_sampling_ratio/min": 0.00038126588333398104, + "sampling/sampling_logp_difference/max": 7.872013568878174, + "sampling/sampling_logp_difference/mean": 0.0197810810059309, + "step": 652 + }, + { + "clip_ratio/high_max": 4.0985580199048854e-05, + "clip_ratio/high_mean": 1.0246395049762214e-05, + "clip_ratio/low_mean": 3.762348410418781e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869878471829e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15116.0, + "completions/max_terminated_length": 
15116.0, + "completions/mean_length": 6384.53125, + "completions/mean_terminated_length": 6384.53125, + "completions/min_length": 1045.0, + "completions/min_terminated_length": 1045.0, + "entropy": 0.9130589440464973, + "epoch": 0.6007359705611776, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029330148827284575, + "learning_rate": 1e-05, + "loss": 0.1305, + "num_tokens": 575915163.0, + "reward": 0.484375, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999364614486694, + "sampling/importance_sampling_ratio/min": 0.0001401908230036497, + "sampling/sampling_logp_difference/max": 8.872506141662598, + "sampling/sampling_logp_difference/mean": 0.019899431616067886, + "step": 653 + }, + { + "clip_ratio/high_max": 4.804920081369346e-06, + "clip_ratio/high_mean": 1.2012300203423365e-06, + "clip_ratio/low_mean": 4.3348386952857254e-05, + "clip_ratio/low_min": 3.435481630731374e-06, + "clip_ratio/region_mean": 4.454961697319959e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14716.0, + "completions/mean_length": 7484.140625, + "completions/mean_terminated_length": 7414.06298828125, + "completions/min_length": 745.0, + "completions/min_terminated_length": 745.0, + "entropy": 0.8762720301747322, + "epoch": 0.6016559337626495, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037648119032382965, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 576895261.0, + "reward": 0.3125, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999864101409912, + "sampling/importance_sampling_ratio/min": 0.0002691639238037169, + 
"sampling/sampling_logp_difference/max": 8.220190048217773, + "sampling/sampling_logp_difference/mean": 0.020455794408917427, + "step": 654 + }, + { + "clip_ratio/high_max": 2.329104518139502e-05, + "clip_ratio/high_mean": 5.822761295348755e-06, + "clip_ratio/low_mean": 5.7342298759976984e-05, + "clip_ratio/low_min": 1.5017260921013076e-05, + "clip_ratio/region_mean": 6.316505982795206e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15614.0, + "completions/mean_length": 7483.8671875, + "completions/mean_terminated_length": 7196.76611328125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.8481424525380135, + "epoch": 0.6025758969641214, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022230292670428753, + "learning_rate": 1e-05, + "loss": 0.0874, + "num_tokens": 577874916.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000191926956177, + "sampling/importance_sampling_ratio/min": 0.002037918195128441, + "sampling/sampling_logp_difference/max": 6.195826530456543, + "sampling/sampling_logp_difference/mean": 0.019235530868172646, + "step": 655 + }, + { + "clip_ratio/high_max": 8.201095170079498e-06, + "clip_ratio/high_mean": 2.0502737925198744e-06, + "clip_ratio/low_mean": 3.113216860128887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.318244205274823e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15621.0, + "completions/mean_length": 6618.34375, + "completions/mean_terminated_length": 6541.44873046875, + "completions/min_length": 563.0, + "completions/min_terminated_length": 563.0, + "entropy": 0.8699518665671349, + "epoch": 0.6034958601655934, + 
"frac_reward_zero_std": 0.5, + "grad_norm": 0.003690029727295041, + "learning_rate": 1e-05, + "loss": 0.0249, + "num_tokens": 578741608.0, + "reward": 0.5390625, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 2.0027882783324458e-05, + "sampling/sampling_logp_difference/max": 10.818385124206543, + "sampling/sampling_logp_difference/mean": 0.019522596150636673, + "step": 656 + }, + { + "clip_ratio/high_max": 4.162365712545579e-06, + "clip_ratio/high_mean": 1.0405914281363948e-06, + "clip_ratio/low_mean": 5.6235591728182044e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7276183270005276e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16165.0, + "completions/mean_length": 6699.6953125, + "completions/mean_terminated_length": 6223.41748046875, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.7825306504964828, + "epoch": 0.6044158233670653, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004026883281767368, + "learning_rate": 1e-05, + "loss": 0.0846, + "num_tokens": 579617377.0, + "reward": 0.4921875, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997950792312622, + "sampling/importance_sampling_ratio/min": 4.181192991836724e-07, + "sampling/sampling_logp_difference/max": 14.687499046325684, + "sampling/sampling_logp_difference/mean": 0.018191896378993988, + "step": 657 + }, + { + "clip_ratio/high_max": 2.1518610083148815e-05, + "clip_ratio/high_mean": 5.379652520787204e-06, + "clip_ratio/low_mean": 
3.858270270029607e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.396235544845695e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15406.0, + "completions/max_terminated_length": 15406.0, + "completions/mean_length": 5984.875, + "completions/mean_terminated_length": 5984.875, + "completions/min_length": 1404.0, + "completions/min_terminated_length": 1404.0, + "entropy": 0.8239431977272034, + "epoch": 0.6053357865685373, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004194674547761679, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 580402633.0, + "reward": 0.484375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999486804008484, + "sampling/importance_sampling_ratio/min": 0.003183862892910838, + "sampling/sampling_logp_difference/max": 5.749660015106201, + "sampling/sampling_logp_difference/mean": 0.019084136933088303, + "step": 658 + }, + { + "clip_ratio/high_max": 2.6722831307779416e-05, + "clip_ratio/high_mean": 6.680707826944854e-06, + "clip_ratio/low_mean": 5.0344978262728546e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.702568614651682e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15690.0, + "completions/mean_length": 5950.5703125, + "completions/mean_terminated_length": 5784.96044921875, + "completions/min_length": 1140.0, + "completions/min_terminated_length": 1140.0, + "entropy": 0.8884857445955276, + "epoch": 0.6062557497700092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005016419570893049, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 581187586.0, + "reward": 0.46875, + "reward_std": 0.2306838035583496, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999349117279053, + "sampling/importance_sampling_ratio/min": 0.002998600946739316, + "sampling/sampling_logp_difference/max": 5.809609413146973, + "sampling/sampling_logp_difference/mean": 0.01908070594072342, + "step": 659 + }, + { + "clip_ratio/high_max": 8.678353879076894e-06, + "clip_ratio/high_mean": 2.1695884697692236e-06, + "clip_ratio/low_mean": 2.6390790822006238e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8560379291775462e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16101.0, + "completions/mean_length": 8029.7734375, + "completions/mean_terminated_length": 7690.17041015625, + "completions/min_length": 1584.0, + "completions/min_terminated_length": 1584.0, + "entropy": 0.858074463903904, + "epoch": 0.6071757129714811, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0035609283950179815, + "learning_rate": 1e-05, + "loss": 0.0718, + "num_tokens": 582236557.0, + "reward": 0.3828125, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999931275844574, + "sampling/importance_sampling_ratio/min": 0.005219157785177231, + "sampling/sampling_logp_difference/max": 5.2554192543029785, + "sampling/sampling_logp_difference/mean": 0.01982714608311653, + "step": 660 + }, + { + "clip_ratio/high_max": 2.362454961257754e-05, + "clip_ratio/high_mean": 7.522766622969357e-06, + "clip_ratio/low_mean": 3.278200858858327e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.030477487049211e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16063.0, + "completions/mean_length": 6958.4921875, + "completions/mean_terminated_length": 
6494.9423828125, + "completions/min_length": 904.0, + "completions/min_terminated_length": 904.0, + "entropy": 0.7957572638988495, + "epoch": 0.6080956761729531, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005020176526159048, + "learning_rate": 1e-05, + "loss": 0.0505, + "num_tokens": 583150740.0, + "reward": 0.328125, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999988853931427, + "sampling/importance_sampling_ratio/min": 0.022197909653186798, + "sampling/sampling_logp_difference/max": 3.8077571392059326, + "sampling/sampling_logp_difference/mean": 0.018450919538736343, + "step": 661 + }, + { + "clip_ratio/high_max": 9.535187928122468e-06, + "clip_ratio/high_mean": 2.383796982030617e-06, + "clip_ratio/low_mean": 4.201903630018933e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.440283305484627e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15856.0, + "completions/max_terminated_length": 15856.0, + "completions/mean_length": 6810.234375, + "completions/mean_terminated_length": 6810.234375, + "completions/min_length": 1105.0, + "completions/min_terminated_length": 1105.0, + "entropy": 0.7868659943342209, + "epoch": 0.609015639374425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005002971272915602, + "learning_rate": 1e-05, + "loss": 0.0826, + "num_tokens": 584044250.0, + "reward": 0.5390625, + "reward_std": 0.22225630283355713, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999277591705322, + "sampling/importance_sampling_ratio/min": 2.1590203687082976e-05, + "sampling/sampling_logp_difference/max": 10.743270874023438, + "sampling/sampling_logp_difference/mean": 0.018436448648571968, + "step": 662 
+ }, + { + "clip_ratio/high_max": 3.5268151805212256e-05, + "clip_ratio/high_mean": 9.566726021148497e-06, + "clip_ratio/low_mean": 5.7681085309013724e-05, + "clip_ratio/low_min": 4.5418209992931224e-06, + "clip_ratio/region_mean": 6.724781314915163e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16291.0, + "completions/mean_length": 7106.296875, + "completions/mean_terminated_length": 6487.78369140625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.8079892098903656, + "epoch": 0.609935602575897, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021831525955349207, + "learning_rate": 1e-05, + "loss": 0.1195, + "num_tokens": 584971568.0, + "reward": 0.5625, + "reward_std": 0.32772916555404663, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 8.157488628057763e-05, + "sampling/sampling_logp_difference/max": 9.413989067077637, + "sampling/sampling_logp_difference/mean": 0.018681492656469345, + "step": 663 + }, + { + "clip_ratio/high_max": 4.332071557655581e-05, + "clip_ratio/high_mean": 1.1574332802410936e-05, + "clip_ratio/low_mean": 3.626145735324826e-05, + "clip_ratio/low_min": 3.933786501875147e-06, + "clip_ratio/region_mean": 4.783579004197236e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16363.0, + "completions/mean_length": 7235.046875, + "completions/mean_terminated_length": 7089.82568359375, + "completions/min_length": 1472.0, + "completions/min_terminated_length": 1472.0, + "entropy": 0.8041050210595131, + "epoch": 0.6108555657773689, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004661369137465954, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 
585916134.0, + "reward": 0.4375, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000330209732056, + "sampling/importance_sampling_ratio/min": 0.0007107750861905515, + "sampling/sampling_logp_difference/max": 7.249154567718506, + "sampling/sampling_logp_difference/mean": 0.018921509385108948, + "step": 664 + }, + { + "clip_ratio/high_max": 1.4951354842196452e-05, + "clip_ratio/high_mean": 3.737838710549113e-06, + "clip_ratio/low_mean": 2.6745638365355262e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0483477416964888e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13888.0, + "completions/mean_length": 7077.5859375, + "completions/mean_terminated_length": 6777.37890625, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "entropy": 0.8417644873261452, + "epoch": 0.6117755289788408, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024479639250785112, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 586841633.0, + "reward": 0.5, + "reward_std": 0.2001592218875885, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998941421508789, + "sampling/importance_sampling_ratio/min": 0.00028577001648955047, + "sampling/sampling_logp_difference/max": 8.160323143005371, + "sampling/sampling_logp_difference/mean": 0.019227145239710808, + "step": 665 + }, + { + "clip_ratio/high_max": 1.7368187855026918e-05, + "clip_ratio/high_mean": 5.19675950272358e-06, + "clip_ratio/low_mean": 4.123253006582672e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.642928979592398e-05, + "completions/clipped_ratio": 0.0703125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 8090.3203125, + "completions/mean_terminated_length": 7463.0673828125, + "completions/min_length": 768.0, + "completions/min_terminated_length": 768.0, + "entropy": 0.7603196427226067, + "epoch": 0.6126954921803128, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005297356750816107, + "learning_rate": 1e-05, + "loss": 0.0352, + "num_tokens": 587897122.0, + "reward": 0.2421875, + "reward_std": 0.27851754426956177, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999694228172302, + "sampling/importance_sampling_ratio/min": 0.0006402728031389415, + "sampling/sampling_logp_difference/max": 7.353616237640381, + "sampling/sampling_logp_difference/mean": 0.018079372122883797, + "step": 666 + }, + { + "clip_ratio/high_max": 1.5767155673529487e-05, + "clip_ratio/high_mean": 3.941788918382372e-06, + "clip_ratio/low_mean": 2.9263440183058265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3205229101440636e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15514.0, + "completions/mean_length": 6908.96875, + "completions/mean_terminated_length": 6360.826171875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.7355617135763168, + "epoch": 0.6136154553817847, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003049109596759081, + "learning_rate": 1e-05, + "loss": 0.0695, + "num_tokens": 588801206.0, + "reward": 0.515625, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999892711639404, + 
"sampling/importance_sampling_ratio/min": 0.0037962812930345535, + "sampling/sampling_logp_difference/max": 5.573733329772949, + "sampling/sampling_logp_difference/mean": 0.018563130870461464, + "step": 667 + }, + { + "clip_ratio/high_max": 1.725199626889662e-05, + "clip_ratio/high_mean": 4.312999067224155e-06, + "clip_ratio/low_mean": 6.839358093202463e-05, + "clip_ratio/low_min": 9.10438984647044e-06, + "clip_ratio/region_mean": 7.27065794308146e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16333.0, + "completions/mean_length": 7110.109375, + "completions/mean_terminated_length": 6810.951171875, + "completions/min_length": 1008.0, + "completions/min_terminated_length": 1008.0, + "entropy": 0.688617967069149, + "epoch": 0.6145354185832567, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034495368599891663, + "learning_rate": 1e-05, + "loss": 0.1521, + "num_tokens": 589732588.0, + "reward": 0.4296875, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999086856842041, + "sampling/importance_sampling_ratio/min": 0.000573390512727201, + "sampling/sampling_logp_difference/max": 7.4639434814453125, + "sampling/sampling_logp_difference/mean": 0.016679491847753525, + "step": 668 + }, + { + "clip_ratio/high_max": 5.049688752478687e-06, + "clip_ratio/high_mean": 2.31802277994575e-06, + "clip_ratio/low_mean": 5.138145911587344e-05, + "clip_ratio/low_min": 3.9801311686460394e-06, + "clip_ratio/region_mean": 5.369948189581919e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16263.0, + "completions/mean_length": 7533.578125, + "completions/mean_terminated_length": 7021.56982421875, + "completions/min_length": 1321.0, + 
"completions/min_terminated_length": 1321.0, + "entropy": 0.7306379675865173, + "epoch": 0.6154553817847286, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004971730522811413, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 590717118.0, + "reward": 0.390625, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 4.6860604925313964e-05, + "sampling/sampling_logp_difference/max": 9.96833324432373, + "sampling/sampling_logp_difference/mean": 0.01741175726056099, + "step": 669 + }, + { + "clip_ratio/high_max": 1.3844989325662027e-05, + "clip_ratio/high_mean": 3.4612473314155068e-06, + "clip_ratio/low_mean": 4.160707453593204e-05, + "clip_ratio/low_min": 7.402582014037762e-06, + "clip_ratio/region_mean": 4.506832192419097e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15983.0, + "completions/mean_length": 6165.0, + "completions/mean_terminated_length": 6002.7939453125, + "completions/min_length": 1088.0, + "completions/min_terminated_length": 1088.0, + "entropy": 0.7227498516440392, + "epoch": 0.6163753449862005, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003072877414524555, + "learning_rate": 1e-05, + "loss": 0.0893, + "num_tokens": 591524494.0, + "reward": 0.5703125, + "reward_std": 0.28353992104530334, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 0.00019289882038719952, + "sampling/sampling_logp_difference/max": 8.5533447265625, + "sampling/sampling_logp_difference/mean": 0.016893092542886734, + "step": 670 + }, + { + 
"clip_ratio/high_max": 3.056439982174197e-05, + "clip_ratio/high_mean": 8.71779502631398e-06, + "clip_ratio/low_mean": 3.8767432329223084e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7485227241850225e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15451.0, + "completions/mean_length": 6061.9375, + "completions/mean_terminated_length": 5728.9677734375, + "completions/min_length": 973.0, + "completions/min_terminated_length": 973.0, + "entropy": 0.813653938472271, + "epoch": 0.6172953081876725, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003992745652794838, + "learning_rate": 1e-05, + "loss": 0.0619, + "num_tokens": 592320726.0, + "reward": 0.578125, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999755620956421, + "sampling/importance_sampling_ratio/min": 7.489924610126764e-05, + "sampling/sampling_logp_difference/max": 9.499366760253906, + "sampling/sampling_logp_difference/mean": 0.018718186765909195, + "step": 671 + }, + { + "clip_ratio/high_max": 1.655339747230755e-05, + "clip_ratio/high_mean": 4.138349368076888e-06, + "clip_ratio/low_mean": 3.851054543702048e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.264889435035002e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16210.0, + "completions/mean_length": 7265.9453125, + "completions/mean_terminated_length": 6658.0751953125, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "entropy": 0.7658502459526062, + "epoch": 0.6182152713891444, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003727070288732648, + "learning_rate": 1e-05, + "loss": 0.1016, + "num_tokens": 593270695.0, + "reward": 0.4921875, + "reward_std": 
0.30327796936035156, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999910831451416, + "sampling/importance_sampling_ratio/min": 2.014157189478283e-06, + "sampling/sampling_logp_difference/max": 13.115309715270996, + "sampling/sampling_logp_difference/mean": 0.017805757001042366, + "step": 672 + }, + { + "clip_ratio/high_max": 2.0501698145380942e-05, + "clip_ratio/high_mean": 6.335726652650919e-06, + "clip_ratio/low_mean": 5.263989112336276e-05, + "clip_ratio/low_min": 1.2888257515442092e-05, + "clip_ratio/region_mean": 5.897561732126633e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16355.0, + "completions/mean_length": 8564.046875, + "completions/mean_terminated_length": 7680.0517578125, + "completions/min_length": 968.0, + "completions/min_terminated_length": 968.0, + "entropy": 0.6856872886419296, + "epoch": 0.6191352345906164, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0038730741944164038, + "learning_rate": 1e-05, + "loss": 0.0535, + "num_tokens": 594386261.0, + "reward": 0.4609375, + "reward_std": 0.32483339309692383, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999313354492188, + "sampling/importance_sampling_ratio/min": 0.00017333027790300548, + "sampling/sampling_logp_difference/max": 8.660311698913574, + "sampling/sampling_logp_difference/mean": 0.01785116083920002, + "step": 673 + }, + { + "clip_ratio/high_max": 2.6982705094269477e-05, + "clip_ratio/high_mean": 8.523603241883393e-06, + "clip_ratio/low_mean": 4.970566510564822e-05, + "clip_ratio/low_min": 4.473552507988643e-06, + "clip_ratio/region_mean": 5.82292680064711e-05, + "completions/clipped_ratio": 0.0546875, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16349.0, + "completions/mean_length": 7838.28125, + "completions/mean_terminated_length": 7343.900390625, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "entropy": 0.636501632630825, + "epoch": 0.6200551977920883, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004014961421489716, + "learning_rate": 1e-05, + "loss": 0.0565, + "num_tokens": 595407313.0, + "reward": 0.46875, + "reward_std": 0.3148210048675537, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 9.145037438429426e-07, + "sampling/sampling_logp_difference/max": 13.904884338378906, + "sampling/sampling_logp_difference/mean": 0.01619477942585945, + "step": 674 + }, + { + "clip_ratio/high_max": 5.649462309520459e-06, + "clip_ratio/high_mean": 1.4123655773801147e-06, + "clip_ratio/low_mean": 2.8467071842896985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.98794374202771e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 6784.5390625, + "completions/mean_terminated_length": 6229.1982421875, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "entropy": 0.6435417085886002, + "epoch": 0.6209751609935602, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004226911347359419, + "learning_rate": 1e-05, + "loss": 0.0855, + "num_tokens": 596291470.0, + "reward": 0.5078125, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999697208404541, + 
"sampling/importance_sampling_ratio/min": 0.00020356501045171171, + "sampling/sampling_logp_difference/max": 8.49952507019043, + "sampling/sampling_logp_difference/mean": 0.015974994748830795, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.2315146964047017e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2315146964047017e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16122.0, + "completions/mean_length": 7650.2265625, + "completions/mean_terminated_length": 6989.689453125, + "completions/min_length": 1063.0, + "completions/min_terminated_length": 1063.0, + "entropy": 0.7500722259283066, + "epoch": 0.6218951241950322, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031262668780982494, + "learning_rate": 1e-05, + "loss": 0.0675, + "num_tokens": 597291107.0, + "reward": 0.4140625, + "reward_std": 0.2012200653553009, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998708963394165, + "sampling/importance_sampling_ratio/min": 3.9317012578976573e-07, + "sampling/sampling_logp_difference/max": 14.7490234375, + "sampling/sampling_logp_difference/mean": 0.01801086962223053, + "step": 676 + }, + { + "clip_ratio/high_max": 2.2775957177145756e-05, + "clip_ratio/high_mean": 5.693989294286439e-06, + "clip_ratio/low_mean": 5.510050823431811e-05, + "clip_ratio/low_min": 4.993807579012355e-06, + "clip_ratio/region_mean": 6.079449713070062e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15718.0, + "completions/mean_length": 6377.4140625, + "completions/mean_terminated_length": 6298.6220703125, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "entropy": 0.8221950903534889, + 
"epoch": 0.6228150873965042, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.006345350295305252, + "learning_rate": 1e-05, + "loss": 0.0759, + "num_tokens": 598129568.0, + "reward": 0.46875, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 6.634136661887169e-05, + "sampling/sampling_logp_difference/max": 9.620697021484375, + "sampling/sampling_logp_difference/mean": 0.01888679713010788, + "step": 677 + }, + { + "clip_ratio/high_max": 2.3920926196296932e-05, + "clip_ratio/high_mean": 7.139227250263502e-06, + "clip_ratio/low_mean": 5.5144641464721644e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.228386882867198e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14250.0, + "completions/mean_length": 5567.2578125, + "completions/mean_terminated_length": 5218.33056640625, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.7284790053963661, + "epoch": 0.6237350505979761, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003562809666618705, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 598862361.0, + "reward": 0.5703125, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999499320983887, + "sampling/importance_sampling_ratio/min": 2.3016077932425105e-07, + "sampling/sampling_logp_difference/max": 15.2844877243042, + "sampling/sampling_logp_difference/mean": 0.016367387026548386, + "step": 678 + }, + { + "clip_ratio/high_max": 1.4490571629721671e-05, + "clip_ratio/high_mean": 4.364888013697055e-06, + 
"clip_ratio/low_mean": 2.498499657122011e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.934988481229084e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15202.0, + "completions/mean_length": 8128.375, + "completions/mean_terminated_length": 7578.00048828125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.7838430106639862, + "epoch": 0.624655013799448, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031477995216846466, + "learning_rate": 1e-05, + "loss": 0.0517, + "num_tokens": 599921233.0, + "reward": 0.34375, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 0.00018987487419508398, + "sampling/sampling_logp_difference/max": 8.569145202636719, + "sampling/sampling_logp_difference/mean": 0.019213391467928886, + "step": 679 + }, + { + "clip_ratio/high_max": 2.650051692398847e-05, + "clip_ratio/high_mean": 8.023214263630507e-06, + "clip_ratio/low_mean": 3.322141196804296e-05, + "clip_ratio/low_min": 2.5509161787340418e-06, + "clip_ratio/region_mean": 4.124462532217876e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15879.0, + "completions/mean_length": 7452.296875, + "completions/mean_terminated_length": 7013.0322265625, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "entropy": 0.8657966181635857, + "epoch": 0.62557497700092, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034168637357652187, + "learning_rate": 1e-05, + "loss": 0.0896, + "num_tokens": 600895023.0, + "reward": 0.296875, + "reward_std": 0.3061561584472656, + "rewards/accuracy_reward/mean": 0.296875, + 
"rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999901056289673, + "sampling/importance_sampling_ratio/min": 0.0003922602627426386, + "sampling/sampling_logp_difference/max": 7.843585014343262, + "sampling/sampling_logp_difference/mean": 0.019955754280090332, + "step": 680 + }, + { + "clip_ratio/high_max": 8.234628239733865e-06, + "clip_ratio/high_mean": 2.0586570599334664e-06, + "clip_ratio/low_mean": 5.516502255886735e-05, + "clip_ratio/low_min": 5.772084023192292e-06, + "clip_ratio/region_mean": 5.7223681096729706e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15759.0, + "completions/mean_length": 7581.625, + "completions/mean_terminated_length": 7072.396484375, + "completions/min_length": 1686.0, + "completions/min_terminated_length": 1686.0, + "entropy": 0.764233261346817, + "epoch": 0.6264949402023919, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026859277859330177, + "learning_rate": 1e-05, + "loss": 0.105, + "num_tokens": 601887935.0, + "reward": 0.421875, + "reward_std": 0.3295465111732483, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.029503032565116882, + "sampling/sampling_logp_difference/max": 3.5232622623443604, + "sampling/sampling_logp_difference/mean": 0.018653862178325653, + "step": 681 + }, + { + "clip_ratio/high_max": 2.654059608175885e-05, + "clip_ratio/high_mean": 6.635149020439712e-06, + "clip_ratio/low_mean": 5.129833289174712e-05, + "clip_ratio/low_min": 5.234505806583911e-06, + "clip_ratio/region_mean": 5.793348100269213e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16279.0, + 
"completions/mean_length": 8824.2421875, + "completions/mean_terminated_length": 8452.4501953125, + "completions/min_length": 1991.0, + "completions/min_terminated_length": 1991.0, + "entropy": 0.7557987719774246, + "epoch": 0.6274149034038639, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002624326851218939, + "learning_rate": 1e-05, + "loss": 0.0491, + "num_tokens": 603035462.0, + "reward": 0.328125, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999333024024963, + "sampling/importance_sampling_ratio/min": 5.1453887863317505e-05, + "sampling/sampling_logp_difference/max": 9.874824523925781, + "sampling/sampling_logp_difference/mean": 0.01799936406314373, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1395032920045196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1395032920045196e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16189.0, + "completions/mean_length": 5832.7890625, + "completions/mean_terminated_length": 5749.70849609375, + "completions/min_length": 948.0, + "completions/min_terminated_length": 948.0, + "entropy": 0.8034545630216599, + "epoch": 0.6283348666053358, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005783884786069393, + "learning_rate": 1e-05, + "loss": 0.0796, + "num_tokens": 603801083.0, + "reward": 0.5234375, + "reward_std": 0.27092626690864563, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000687837600708, + "sampling/importance_sampling_ratio/min": 0.033374395221471786, + "sampling/sampling_logp_difference/max": 3.399966239929199, + 
"sampling/sampling_logp_difference/mean": 0.01805710420012474, + "step": 683 + }, + { + "clip_ratio/high_max": 2.2193052700458793e-05, + "clip_ratio/high_mean": 6.736250270478195e-06, + "clip_ratio/low_mean": 5.521000275621191e-05, + "clip_ratio/low_min": 9.064021924132248e-06, + "clip_ratio/region_mean": 6.19462530266901e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16221.0, + "completions/mean_length": 7247.4609375, + "completions/mean_terminated_length": 7102.43701171875, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.908146396279335, + "epoch": 0.6292548298068077, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.005038067698478699, + "learning_rate": 1e-05, + "loss": 0.0832, + "num_tokens": 604748150.0, + "reward": 0.46875, + "reward_std": 0.43106767535209656, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.0030831864569336176, + "sampling/sampling_logp_difference/max": 5.781791687011719, + "sampling/sampling_logp_difference/mean": 0.01983889564871788, + "step": 684 + }, + { + "clip_ratio/high_max": 8.630155889477464e-06, + "clip_ratio/high_mean": 2.157538972369366e-06, + "clip_ratio/low_mean": 6.599987852951017e-05, + "clip_ratio/low_min": 1.7551100199852954e-05, + "clip_ratio/region_mean": 6.815741778609663e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15519.0, + "completions/mean_length": 6861.078125, + "completions/mean_terminated_length": 6473.96728515625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "entropy": 0.7612876370549202, + "epoch": 0.6301747930082797, + "frac_reward_zero_std": 0.125, + "grad_norm": 
0.0053928992711007595, + "learning_rate": 1e-05, + "loss": 0.0967, + "num_tokens": 605642768.0, + "reward": 0.5078125, + "reward_std": 0.40503159165382385, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 4.585228089126758e-05, + "sampling/sampling_logp_difference/max": 9.99008560180664, + "sampling/sampling_logp_difference/mean": 0.018197370693087578, + "step": 685 + }, + { + "clip_ratio/high_max": 2.531879181333352e-05, + "clip_ratio/high_mean": 6.32969795333338e-06, + "clip_ratio/low_mean": 5.132838714416721e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.765808464275324e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 6837.8203125, + "completions/mean_terminated_length": 6201.40869140625, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.6217481270432472, + "epoch": 0.6310947562097516, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0032709913793951273, + "learning_rate": 1e-05, + "loss": 0.1155, + "num_tokens": 606534577.0, + "reward": 0.484375, + "reward_std": 0.2567248046398163, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999561309814453, + "sampling/importance_sampling_ratio/min": 4.2650382965803146e-05, + "sampling/sampling_logp_difference/max": 10.062474250793457, + "sampling/sampling_logp_difference/mean": 0.016331009566783905, + "step": 686 + }, + { + "clip_ratio/high_max": 1.0992388070008019e-05, + "clip_ratio/high_mean": 3.581897317417315e-06, + "clip_ratio/low_mean": 5.021198876420385e-05, + "clip_ratio/low_min": 
4.219409220240777e-06, + "clip_ratio/region_mean": 5.379388539950014e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15103.0, + "completions/mean_length": 6458.703125, + "completions/mean_terminated_length": 6380.55126953125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.7460968196392059, + "epoch": 0.6320147194112236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002640153281390667, + "learning_rate": 1e-05, + "loss": 0.0581, + "num_tokens": 607381811.0, + "reward": 0.4140625, + "reward_std": 0.2382800281047821, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 8.858721116666857e-07, + "sampling/sampling_logp_difference/max": 13.93669319152832, + "sampling/sampling_logp_difference/mean": 0.017693117260932922, + "step": 687 + }, + { + "clip_ratio/high_max": 1.2546400967039517e-05, + "clip_ratio/high_mean": 3.1366002417598793e-06, + "clip_ratio/low_mean": 6.0473582834674744e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.361018404277274e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16121.0, + "completions/mean_length": 7043.1640625, + "completions/mean_terminated_length": 6894.8974609375, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 0.7884078621864319, + "epoch": 0.6329346826126955, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003713687416166067, + "learning_rate": 1e-05, + "loss": 0.0635, + "num_tokens": 608302256.0, + "reward": 0.390625, + "reward_std": 0.2648528814315796, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999112486839294, + "sampling/importance_sampling_ratio/min": 9.931326871992496e-08, + "sampling/sampling_logp_difference/max": 16.12498664855957, + "sampling/sampling_logp_difference/mean": 0.019254781305789948, + "step": 688 + }, + { + "clip_ratio/high_max": 7.887592573752045e-06, + "clip_ratio/high_mean": 1.971898143438011e-06, + "clip_ratio/low_mean": 4.4303845015747356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6275743216028786e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15487.0, + "completions/mean_length": 8012.8359375, + "completions/mean_terminated_length": 7742.79833984375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.8368816301226616, + "epoch": 0.6338546458141674, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004894682671874762, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 609348299.0, + "reward": 0.4296875, + "reward_std": 0.3027411997318268, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000343322753906, + "sampling/importance_sampling_ratio/min": 0.0021496599074453115, + "sampling/sampling_logp_difference/max": 6.1424455642700195, + "sampling/sampling_logp_difference/mean": 0.01958826184272766, + "step": 689 + }, + { + "clip_ratio/high_max": 1.0690811450331239e-05, + "clip_ratio/high_mean": 2.6727028625828098e-06, + "clip_ratio/low_mean": 3.859445814669016e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1267160668212455e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16065.0, + "completions/mean_length": 7594.3671875, + "completions/mean_terminated_length": 
7008.39208984375, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.692665733397007, + "epoch": 0.6347746090156394, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0039004215504974127, + "learning_rate": 1e-05, + "loss": 0.0574, + "num_tokens": 610341090.0, + "reward": 0.3984375, + "reward_std": 0.3284856975078583, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999041557312012, + "sampling/importance_sampling_ratio/min": 4.006533345091157e-05, + "sampling/sampling_logp_difference/max": 10.124999046325684, + "sampling/sampling_logp_difference/mean": 0.01734849065542221, + "step": 690 + }, + { + "clip_ratio/high_max": 4.406994776218198e-06, + "clip_ratio/high_mean": 2.7999831218039617e-06, + "clip_ratio/low_mean": 5.9335616697353544e-05, + "clip_ratio/low_min": 5.472375505632954e-06, + "clip_ratio/region_mean": 6.21355998191575e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 7640.09375, + "completions/mean_terminated_length": 7358.0322265625, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.8469130471348763, + "epoch": 0.6356945722171113, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004913663491606712, + "learning_rate": 1e-05, + "loss": 0.0782, + "num_tokens": 611339726.0, + "reward": 0.359375, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998912811279297, + "sampling/importance_sampling_ratio/min": 4.2071459205317296e-08, + "sampling/sampling_logp_difference/max": 16.983896255493164, + "sampling/sampling_logp_difference/mean": 
0.019604282453656197, + "step": 691 + }, + { + "clip_ratio/high_max": 1.4971937162044924e-05, + "clip_ratio/high_mean": 5.209913979342673e-06, + "clip_ratio/low_mean": 2.7830240469484124e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.304015490357415e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15895.0, + "completions/max_terminated_length": 15895.0, + "completions/mean_length": 5063.6953125, + "completions/mean_terminated_length": 5063.6953125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.7586102113127708, + "epoch": 0.6366145354185833, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0032354791183024645, + "learning_rate": 1e-05, + "loss": 0.0371, + "num_tokens": 612005495.0, + "reward": 0.59375, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999009370803833, + "sampling/importance_sampling_ratio/min": 0.02037520334124565, + "sampling/sampling_logp_difference/max": 3.8934366703033447, + "sampling/sampling_logp_difference/mean": 0.0178166925907135, + "step": 692 + }, + { + "clip_ratio/high_max": 2.1337797079468146e-05, + "clip_ratio/high_mean": 5.3344492698670365e-06, + "clip_ratio/low_mean": 1.1576638144106255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.691108741397329e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14868.0, + "completions/mean_length": 6542.1640625, + "completions/mean_terminated_length": 6385.94482421875, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.847448967397213, + "epoch": 0.6375344986200552, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004039868246763945, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 
612870060.0, + "reward": 0.453125, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998836517333984, + "sampling/importance_sampling_ratio/min": 2.2897740994953786e-11, + "sampling/sampling_logp_difference/max": 24.499982833862305, + "sampling/sampling_logp_difference/mean": 0.019780561327934265, + "step": 693 + }, + { + "clip_ratio/high_max": 6.333826149784727e-06, + "clip_ratio/high_mean": 1.5834565374461818e-06, + "clip_ratio/low_mean": 3.4833526569855167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.641698299361451e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16287.0, + "completions/mean_length": 5805.8203125, + "completions/mean_terminated_length": 5551.9443359375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.6972410827875137, + "epoch": 0.6384544618215271, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023007066920399666, + "learning_rate": 1e-05, + "loss": 0.0632, + "num_tokens": 613633581.0, + "reward": 0.609375, + "reward_std": 0.23857943713665009, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000262260437012, + "sampling/importance_sampling_ratio/min": 0.00026135475491173565, + "sampling/sampling_logp_difference/max": 8.249631881713867, + "sampling/sampling_logp_difference/mean": 0.016993921250104904, + "step": 694 + }, + { + "clip_ratio/high_max": 6.643952701779199e-06, + "clip_ratio/high_mean": 1.6609881754447997e-06, + "clip_ratio/low_mean": 1.501361566624837e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.667460389853659e-05, + "completions/clipped_ratio": 0.09375, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16249.0, + "completions/mean_length": 7504.65625, + "completions/mean_terminated_length": 6586.103515625, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "entropy": 0.7908455803990364, + "epoch": 0.6393744250229991, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0029130352195352316, + "learning_rate": 1e-05, + "loss": 0.0413, + "num_tokens": 614611881.0, + "reward": 0.3671875, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999863862991333, + "sampling/importance_sampling_ratio/min": 2.025089543167269e-06, + "sampling/sampling_logp_difference/max": 13.109896659851074, + "sampling/sampling_logp_difference/mean": 0.018666472285985947, + "step": 695 + }, + { + "clip_ratio/high_max": 1.817479960664059e-05, + "clip_ratio/high_mean": 4.543699901660148e-06, + "clip_ratio/low_mean": 5.670640712196473e-05, + "clip_ratio/low_min": 6.148246484372066e-06, + "clip_ratio/region_mean": 6.125010668256436e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13984.0, + "completions/max_terminated_length": 13984.0, + "completions/mean_length": 5627.265625, + "completions/mean_terminated_length": 5627.265625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.7167766839265823, + "epoch": 0.640294388224471, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0020515238866209984, + "learning_rate": 1e-05, + "loss": 0.1054, + "num_tokens": 615355915.0, + "reward": 0.421875, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999716877937317, + 
"sampling/importance_sampling_ratio/min": 0.002808797173202038, + "sampling/sampling_logp_difference/max": 5.874999046325684, + "sampling/sampling_logp_difference/mean": 0.01694992370903492, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.3280599786376115e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3280599786376115e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14713.0, + "completions/mean_length": 6129.9140625, + "completions/mean_terminated_length": 5967.1513671875, + "completions/min_length": 1201.0, + "completions/min_terminated_length": 1201.0, + "entropy": 0.7654511705040932, + "epoch": 0.641214351425943, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003425017697736621, + "learning_rate": 1e-05, + "loss": 0.0428, + "num_tokens": 616159416.0, + "reward": 0.5546875, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.005587513092905283, + "sampling/sampling_logp_difference/max": 5.187221050262451, + "sampling/sampling_logp_difference/mean": 0.01828661933541298, + "step": 697 + }, + { + "clip_ratio/high_max": 2.1838685825059656e-05, + "clip_ratio/high_mean": 5.459671456264914e-06, + "clip_ratio/low_mean": 3.4785461366482195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.024513225431292e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16340.0, + "completions/mean_length": 7219.078125, + "completions/mean_terminated_length": 7146.91357421875, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.847568191587925, + "epoch": 
0.6421343146274149, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.005707201547920704, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 617101738.0, + "reward": 0.53125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 2.5612937406549463e-06, + "sampling/sampling_logp_difference/max": 12.874998092651367, + "sampling/sampling_logp_difference/mean": 0.01983051374554634, + "step": 698 + }, + { + "clip_ratio/high_max": 2.676450185390422e-05, + "clip_ratio/high_mean": 8.55213056638604e-06, + "clip_ratio/low_mean": 5.492671812135086e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.347884914248425e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14972.0, + "completions/mean_length": 6116.96875, + "completions/mean_terminated_length": 5870.56005859375, + "completions/min_length": 1371.0, + "completions/min_terminated_length": 1371.0, + "entropy": 0.7148991823196411, + "epoch": 0.6430542778288868, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.004018646199256182, + "learning_rate": 1e-05, + "loss": 0.078, + "num_tokens": 617903030.0, + "reward": 0.5546875, + "reward_std": 0.2569621503353119, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999732971191406, + "sampling/importance_sampling_ratio/min": 0.00015846146561671048, + "sampling/sampling_logp_difference/max": 8.749999046325684, + "sampling/sampling_logp_difference/mean": 0.017638593912124634, + "step": 699 + }, + { + "clip_ratio/high_max": 3.844970706268214e-06, + "clip_ratio/high_mean": 1.9004990008397726e-06, + 
"clip_ratio/low_mean": 7.103690825260855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.29374083903167e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15232.0, + "completions/mean_length": 7486.515625, + "completions/mean_terminated_length": 7272.9765625, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.7912377193570137, + "epoch": 0.6439742410303588, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00282766274176538, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 618880312.0, + "reward": 0.4453125, + "reward_std": 0.32089442014694214, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999787211418152, + "sampling/importance_sampling_ratio/min": 0.0001030677231028676, + "sampling/sampling_logp_difference/max": 9.180124282836914, + "sampling/sampling_logp_difference/mean": 0.01940794661641121, + "step": 700 + }, + { + "clip_ratio/high_max": 2.241842275907402e-05, + "clip_ratio/high_mean": 6.616161613237637e-06, + "clip_ratio/low_mean": 3.103233757428825e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.76484995285864e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13644.0, + "completions/mean_length": 7297.453125, + "completions/mean_terminated_length": 6610.23583984375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.8420139253139496, + "epoch": 0.6448942042318307, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016839519375935197, + "learning_rate": 1e-05, + "loss": 0.0438, + "num_tokens": 619834002.0, + "reward": 0.3359375, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 
0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999658465385437, + "sampling/importance_sampling_ratio/min": 0.0005040382966399193, + "sampling/sampling_logp_difference/max": 7.59285831451416, + "sampling/sampling_logp_difference/mean": 0.019356656819581985, + "step": 701 + }, + { + "clip_ratio/high_max": 9.791850970941596e-06, + "clip_ratio/high_mean": 2.447962742735399e-06, + "clip_ratio/low_mean": 4.7923438614816405e-05, + "clip_ratio/low_min": 3.219243353669299e-06, + "clip_ratio/region_mean": 5.0371401357551804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15471.0, + "completions/mean_length": 5935.53125, + "completions/mean_terminated_length": 5684.76806640625, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.6855737417936325, + "epoch": 0.6458141674333027, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00550073804333806, + "learning_rate": 1e-05, + "loss": 0.0822, + "num_tokens": 620615054.0, + "reward": 0.4609375, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000221729278564, + "sampling/importance_sampling_ratio/min": 2.4300854420289397e-05, + "sampling/sampling_logp_difference/max": 10.624999046325684, + "sampling/sampling_logp_difference/mean": 0.01712688058614731, + "step": 702 + }, + { + "clip_ratio/high_max": 1.3569449947681278e-05, + "clip_ratio/high_mean": 3.3923624869203195e-06, + "clip_ratio/low_mean": 2.6169475859205704e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.95618385734997e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14395.0, + "completions/mean_length": 6016.0625, + 
"completions/mean_terminated_length": 5851.4921875, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "entropy": 0.7685846760869026, + "epoch": 0.6467341306347746, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.005174044985324144, + "learning_rate": 1e-05, + "loss": 0.0922, + "num_tokens": 621407854.0, + "reward": 0.4453125, + "reward_std": 0.25330984592437744, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999493956565857, + "sampling/importance_sampling_ratio/min": 3.535773794283159e-05, + "sampling/sampling_logp_difference/max": 10.249993324279785, + "sampling/sampling_logp_difference/mean": 0.017704609781503677, + "step": 703 + }, + { + "clip_ratio/high_max": 8.932004220696399e-06, + "clip_ratio/high_mean": 2.2330010551740997e-06, + "clip_ratio/low_mean": 3.712984198500635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.936284304018045e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15771.0, + "completions/mean_length": 6402.3984375, + "completions/mean_terminated_length": 6323.80322265625, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.8285454586148262, + "epoch": 0.6476540938362465, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022989478893578053, + "learning_rate": 1e-05, + "loss": 0.1083, + "num_tokens": 622246633.0, + "reward": 0.4453125, + "reward_std": 0.32089439034461975, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479055404663, + "sampling/importance_sampling_ratio/min": 4.360687739790592e-07, + "sampling/sampling_logp_difference/max": 14.645465850830078, + 
"sampling/sampling_logp_difference/mean": 0.01977568492293358, + "step": 704 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 622246633, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_milora_plus_20251201_131939/checkpoint-704/zero_to_fp32.py b/dapo_milora_plus_20251201_131939/checkpoint-704/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_milora_plus_20251201_131939/checkpoint-704/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_milora_plus_20251201_131939/output.log b/dapo_milora_plus_20251201_131939/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2697349482dbf063b4861232778ab4639af76e6f --- /dev/null +++ b/dapo_milora_plus_20251201_131939/output.log @@ -0,0 +1,8813 @@ +W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] +W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] ***************************************** +W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W1201 13:19:59.658000 1171813 torch/distributed/run.py:774] ***************************************** +INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda. +INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda. +INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda. +INFO 12-01 13:20:21 [__init__.py:216] Automatically detected platform cuda. 
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, 
max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) + +[OpenTinker] 2025-12-01 
13:20:26,167 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it +[OpenTinker] 2025-12-01 13:20:26,167 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it +[OpenTinker] 2025-12-01 13:20:26,167 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='milora_plus', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_milora_plus_20251201_131939', run_name='outputs/dapo_milora_plus_20251201_131939', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-01 13:20:26,169 - root - INFO - Output directory outputs/dapo_milora_plus_20251201_131939 already exists, using it +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. 
Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: setting up run ruhht7fc +wandb: setting up run f7ojo7cc +wandb: setting up run 56v55mci +wandb: setting up run 79eq2874 +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-79eq2874 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_milora_plus_20251201_131939 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/79eq2874 +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-ruhht7fc +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_milora_plus_20251201_131939 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/ruhht7fc +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-f7ojo7cc +wandb: Run `wandb offline` to turn off syncing. 
+wandb: Syncing run outputs/dapo_milora_plus_20251201_131939 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/f7ojo7cc +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251201_132029-56v55mci +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_milora_plus_20251201_131939 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/56v55mci +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-01 13:20:31,864 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-01 13:20:31,962 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-01 13:20:31,962 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-01 13:20:32,107 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-01 13:20:32,107 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-01 13:20:33,070 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-01 13:20:33,133 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-01 13:20:33,152 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-01 13:20:33,507 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-01 13:20:36,261 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-01 13:20:36,268 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-01 13:20:36,448 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +`torch_dtype` is deprecated! Use `dtype` instead! +[OpenTinker] 2025-12-01 13:20:36,623 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! 
+[OpenTinker] 2025-12-01 13:20:37,520 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-01 13:20:37,520 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-01 13:20:37,635 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-01 13:20:37,635 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-01 13:20:37,706 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization... +[OpenTinker] 2025-12-01 13:20:37,706 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace +[OpenTinker] 2025-12-01 13:20:37,749 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-01 13:20:37,749 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-01 13:20:37,791 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization... +[OpenTinker] 2025-12-01 13:20:37,791 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace +[OpenTinker] 2025-12-01 13:20:37,897 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-01 13:20:37,897 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-01 13:20:37,902 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization... +[OpenTinker] 2025-12-01 13:20:37,903 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace +[OpenTinker] 2025-12-01 13:20:38,050 - perl.lora.milora_plus - INFO - Starting MiLoRA++ (Direction-Only) initialization... 
+[OpenTinker] 2025-12-01 13:20:38,051 - perl.lora.milora_plus - INFO - Mode: min | Rank: 16 | Target: Off-Principal Subspace +[OpenTinker] 2025-12-01 13:20:38,417 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:38,448 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:38,478 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:38,513 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:38,545 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:38,575 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:38,630 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:38,662 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:38,692 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:38,763 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:38,795 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:38,825 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:40,266 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:40,387 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:40,512 - 
perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:40,621 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:45,341 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:45,556 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:45,736 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:45,798 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.0.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:46,049 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:46,080 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:46,111 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:46,273 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:46,303 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:46,334 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:46,446 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:46,477 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:46,507 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:46,524 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.1.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:46,554 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:46,586 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:47,887 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:48,130 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:48,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:48,395 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:52,959 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:53,290 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:53,388 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:53,646 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.1.mlp.down_proj +[OpenTinker] 2025-12-01 13:20:53,669 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:53,700 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:53,732 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:54,009 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:54,040 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj +[OpenTinker] 
2025-12-01 13:20:54,070 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:54,099 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:54,130 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:54,161 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:54,376 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.q_proj +[OpenTinker] 2025-12-01 13:20:54,407 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.k_proj +[OpenTinker] 2025-12-01 13:20:54,438 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.self_attn.v_proj +[OpenTinker] 2025-12-01 13:20:55,507 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:55,866 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:55,936 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj +[OpenTinker] 2025-12-01 13:20:56,245 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:00,574 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:01,031 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:01,034 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:01,289 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:01,320 - perl.lora.milora_plus - INFO - 
Processed: base_model.model.model.layers.3.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:01,349 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:01,516 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.2.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:01,745 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:01,751 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:01,776 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:01,782 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:01,806 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:01,814 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:02,245 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:02,276 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:02,306 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:03,142 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:03,596 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:03,625 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:04,123 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.3.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:08,214 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:08,687 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:08,795 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:08,926 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:08,956 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:08,986 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:09,366 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.3.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:09,395 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:09,426 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:09,456 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:09,516 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:09,547 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:09,576 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:10,096 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:10,126 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.k_proj 
+[OpenTinker] 2025-12-01 13:21:10,156 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:10,766 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:11,232 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:11,378 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:11,960 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:15,801 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:16,318 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:16,505 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:16,522 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:16,535 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:16,565 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:17,026 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:17,057 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:17,087 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:17,216 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.4.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:17,240 - perl.lora.milora_plus - 
INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:17,271 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:17,300 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:17,945 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:17,975 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:18,006 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:18,341 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:18,864 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:19,094 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:19,814 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:23,392 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:23,957 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:24,102 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:24,133 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:24,163 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:24,247 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.5.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:24,659 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:24,690 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:24,720 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:24,967 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:24,998 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:25,028 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:25,049 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.5.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:25,769 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:25,800 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:25,829 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:25,943 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:26,500 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:26,819 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:27,635 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:31,007 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj +[OpenTinker] 
2025-12-01 13:21:31,578 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:31,711 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:31,741 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:31,771 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:31,965 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:32,283 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:32,313 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:32,342 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:32,683 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:32,714 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:32,744 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:32,888 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.6.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:33,555 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:33,612 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:33,643 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:33,673 - perl.lora.milora_plus 
- INFO - Processed: base_model.model.model.layers.7.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:34,130 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:34,532 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:35,466 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:38,606 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:39,186 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:39,319 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:39,350 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:39,380 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:39,674 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:39,894 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:39,925 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:39,954 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:40,397 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:40,428 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:40,457 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.8.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:40,583 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.7.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:41,162 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:41,299 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:41,329 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:41,359 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:41,736 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:42,254 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:43,155 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:46,217 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:46,796 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:46,922 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:46,954 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:46,984 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:47,441 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:47,504 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj +[OpenTinker] 
2025-12-01 13:21:47,534 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:47,564 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:48,164 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:48,195 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:48,225 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:48,272 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.8.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:48,766 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:48,987 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:49,017 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:49,047 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:49,351 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:50,024 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:50,845 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:53,809 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:54,398 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:54,522 - perl.lora.milora_plus - INFO - 
Processed: base_model.model.model.layers.10.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:54,552 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:54,583 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:55,106 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:55,139 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:55,170 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:55,199 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:55,915 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:55,943 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.9.mlp.down_proj +[OpenTinker] 2025-12-01 13:21:55,945 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:55,975 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:56,358 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:56,651 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.q_proj +[OpenTinker] 2025-12-01 13:21:56,681 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.k_proj +[OpenTinker] 2025-12-01 13:21:56,712 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.self_attn.v_proj +[OpenTinker] 2025-12-01 13:21:56,944 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.10.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:57,769 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.up_proj +[OpenTinker] 2025-12-01 13:21:58,506 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:01,409 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:01,987 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:02,115 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:02,146 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:02,178 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:02,689 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:02,719 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:02,750 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:02,920 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:03,589 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.10.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:03,635 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:03,665 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:03,695 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.11.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:03,954 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:04,294 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:04,324 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:04,354 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:04,537 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:05,484 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:06,135 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:08,981 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:09,589 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:09,678 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:09,708 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:09,738 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:10,286 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:10,316 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:10,346 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.12.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:10,647 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:11,246 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.11.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:11,361 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:11,391 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:11,421 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:11,520 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:11,949 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:11,978 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:12,008 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:12,134 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:13,210 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:13,802 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:16,572 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:17,177 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:17,285 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj 
+[OpenTinker] 2025-12-01 13:22:17,314 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:17,344 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:17,881 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:17,911 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:17,940 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:18,370 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:18,904 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.12.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:19,092 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:19,122 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:19,128 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:19,152 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:19,614 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:19,644 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:19,674 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:19,721 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj +[OpenTinker] 2025-12-01 
13:22:20,944 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:21,464 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:24,184 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:24,777 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:24,886 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:24,915 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:24,945 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:25,476 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:25,505 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:25,535 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:26,097 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:26,554 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.13.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:26,725 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:26,812 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:26,842 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:26,872 - perl.lora.milora_plus - INFO - 
Processed: base_model.model.model.layers.14.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:27,257 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:27,287 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:27,314 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:27,317 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:28,663 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:29,118 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:31,794 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:32,354 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:32,495 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:32,525 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:32,555 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:33,054 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:33,084 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:33,113 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:33,818 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.14.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:34,221 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.14.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:34,354 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:34,533 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:34,563 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:34,593 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:34,902 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:34,926 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:34,956 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:34,986 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:36,390 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:36,779 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:39,409 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:39,942 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:40,112 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:40,142 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj 
+[OpenTinker] 2025-12-01 13:22:40,171 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:40,643 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:40,672 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:40,702 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:41,557 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:41,885 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.15.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:41,958 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:42,275 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:42,306 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:42,335 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:42,484 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:42,593 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:42,623 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:42,654 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:44,135 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:44,451 - 
perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:47,021 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:47,550 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:47,726 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:47,756 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:47,788 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:48,259 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:48,289 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:48,319 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:49,294 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:49,572 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:49,596 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.16.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:50,013 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:50,044 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:50,073 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:50,099 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.17.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:50,308 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:50,339 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:50,369 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:51,866 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:52,170 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:54,634 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:55,161 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:55,338 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:55,368 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:55,398 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:55,866 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:55,896 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:55,925 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:57,024 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:57,194 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.18.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:57,297 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.17.mlp.down_proj +[OpenTinker] 2025-12-01 13:22:57,702 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:57,740 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:57,770 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:57,800 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:58,009 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.q_proj +[OpenTinker] 2025-12-01 13:22:58,039 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.k_proj +[OpenTinker] 2025-12-01 13:22:58,069 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.self_attn.v_proj +[OpenTinker] 2025-12-01 13:22:59,587 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.up_proj +[OpenTinker] 2025-12-01 13:22:59,862 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:02,256 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:02,756 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:02,960 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:02,990 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:03,020 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.19.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:03,459 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:03,489 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:03,519 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:04,745 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:04,804 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:05,000 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.18.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:05,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:05,469 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:05,499 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:05,530 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:05,711 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:05,741 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:05,771 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:07,326 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:07,565 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.19.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:09,855 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:10,351 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:10,554 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:10,585 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:10,614 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:11,054 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:11,084 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:11,114 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:12,406 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:12,487 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:12,683 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.19.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:12,895 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:13,203 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:13,232 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:13,263 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.20.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:13,390 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:13,420 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:13,449 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:15,059 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:15,249 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:17,476 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:17,963 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:18,177 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:18,207 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:18,237 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:18,668 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:18,698 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:18,728 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:20,022 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:20,242 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.20.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:20,414 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.20.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:20,509 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:20,958 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:20,988 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:21,017 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:21,122 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:21,152 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:21,181 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:22,810 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:22,968 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:25,088 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:25,567 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:25,797 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:25,827 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:25,857 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.22.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:26,271 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:26,301 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:26,331 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:27,644 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:27,980 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:28,107 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.21.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:28,114 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:28,699 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:28,729 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:28,759 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:28,817 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:28,847 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:28,876 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:30,560 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:30,688 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.22.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:32,718 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:33,176 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:33,421 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:33,451 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:33,481 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:33,882 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:33,912 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:33,941 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:35,262 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:35,726 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:35,732 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:35,819 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.22.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:36,445 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:36,476 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:36,505 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.23.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:36,532 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:36,561 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:36,591 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:38,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:38,390 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:40,348 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:40,808 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:41,057 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:41,087 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:41,117 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:41,522 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:41,552 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:41,582 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:42,893 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:43,375 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.24.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:43,483 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:43,536 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.23.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:44,203 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:44,234 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:44,252 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:44,263 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:44,283 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:44,312 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:46,055 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:46,114 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:47,952 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:48,435 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:48,657 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:48,687 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:48,716 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.25.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:49,139 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:49,169 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:49,199 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:50,499 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:50,994 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:51,236 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:51,267 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.24.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:51,953 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:51,979 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:51,983 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:52,009 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:52,013 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:52,039 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:53,818 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:53,834 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.25.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:55,560 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:56,058 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:56,266 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:56,296 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:56,328 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:56,761 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:56,791 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:56,821 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:58,108 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:58,607 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj +[OpenTinker] 2025-12-01 13:23:58,983 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:59,001 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.25.mlp.down_proj +[OpenTinker] 2025-12-01 13:23:59,693 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:59,713 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.q_proj +[OpenTinker] 2025-12-01 13:23:59,723 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.26.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:59,744 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.k_proj +[OpenTinker] 2025-12-01 13:23:59,753 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj +[OpenTinker] 2025-12-01 13:23:59,773 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.self_attn.v_proj +[OpenTinker] 2025-12-01 13:24:01,551 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj +[OpenTinker] 2025-12-01 13:24:01,577 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.up_proj +[OpenTinker] 2025-12-01 13:24:03,165 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:03,681 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:03,865 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj +[OpenTinker] 2025-12-01 13:24:03,894 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj +[OpenTinker] 2025-12-01 13:24:03,924 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj +[OpenTinker] 2025-12-01 13:24:04,381 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj +[OpenTinker] 2025-12-01 13:24:04,411 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj +[OpenTinker] 2025-12-01 13:24:04,441 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj +[OpenTinker] 2025-12-01 13:24:05,693 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.up_proj +[OpenTinker] 2025-12-01 13:24:06,229 - perl.lora.milora_plus - INFO - Processed: 
base_model.model.model.layers.27.mlp.up_proj +[OpenTinker] 2025-12-01 13:24:06,672 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:06,749 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.26.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:07,380 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj +[OpenTinker] 2025-12-01 13:24:07,410 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj +[OpenTinker] 2025-12-01 13:24:07,439 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj +[OpenTinker] 2025-12-01 13:24:07,460 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.q_proj +[OpenTinker] 2025-12-01 13:24:07,490 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.k_proj +[OpenTinker] 2025-12-01 13:24:07,520 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.self_attn.v_proj +[OpenTinker] 2025-12-01 13:24:09,226 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.up_proj +[OpenTinker] 2025-12-01 13:24:09,313 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.up_proj +[OpenTinker] 2025-12-01 13:24:10,752 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:10,753 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 213.05 seconds +[OpenTinker] 2025-12-01 13:24:10,754 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-01 13:24:11,252 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpxmtxyw6d/test.c -o /tmp/tmpxmtxyw6d/test.o +[OpenTinker] 2025-12-01 13:24:11,280 
- root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpxmtxyw6d/test.o -laio -o /tmp/tmpxmtxyw6d/a.out +[OpenTinker] 2025-12-01 13:24:11,298 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:11,299 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 213.25 seconds +[OpenTinker] 2025-12-01 13:24:11,300 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-01 13:24:11,606 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpdzsydi46/test.c -o /tmp/tmpdzsydi46/test.o +[OpenTinker] 2025-12-01 13:24:11,632 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpdzsydi46/test.o -laio -o /tmp/tmpdzsydi46/a.out +[OpenTinker] 2025-12-01 13:24:11,778 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp14hhwhxl/test.c -o /tmp/tmp14hhwhxl/test.o +[OpenTinker] 2025-12-01 13:24:11,806 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp14hhwhxl/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp14hhwhxl/a.out +[OpenTinker] 2025-12-01 13:24:12,087 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpxl1e93d0/test.c -o /tmp/tmpxl1e93d0/test.o +[OpenTinker] 2025-12-01 13:24:12,121 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpxl1e93d0/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpxl1e93d0/a.out +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO 
Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +[OpenTinker] 2025-12-01 13:24:14,348 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:14,348 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 216.45 seconds +[OpenTinker] 2025-12-01 13:24:14,349 - root - INFO - Lora configured successfully +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. 
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Using network Socket +[OpenTinker] 2025-12-01 13:24:14,492 - perl.lora.milora_plus - INFO - Processed: base_model.model.model.layers.27.mlp.down_proj +[OpenTinker] 2025-12-01 13:24:14,493 - perl.lora.milora_plus - INFO - MiLoRA++ initialization completed in 216.70 seconds +[OpenTinker] 2025-12-01 13:24:14,493 - root - INFO - Lora configured successfully +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO ncclCommInitRankConfig comm 0x16a41420 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0xb34e3953a9afc2f1 - Init START +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO ncclCommInitRankConfig comm 0x1882be50 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0xb34e3953a9afc2f1 - Init START +[OpenTinker] 2025-12-01 13:24:14,798 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpkw1j4eyo/test.c -o /tmp/tmpkw1j4eyo/test.o +[OpenTinker] 2025-12-01 13:24:14,826 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpkw1j4eyo/test.o -laio -o /tmp/tmpkw1j4eyo/a.out +[OpenTinker] 2025-12-01 13:24:14,852 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpu4eu2mm2/test.c -o /tmp/tmpu4eu2mm2/test.o +[OpenTinker] 2025-12-01 13:24:14,876 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpu4eu2mm2/test.o -laio -o /tmp/tmpu4eu2mm2/a.out +[OpenTinker] 2025-12-01 13:24:15,335 - 
root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpp2jhwke4/test.c -o /tmp/tmpp2jhwke4/test.o +[OpenTinker] 2025-12-01 13:24:15,347 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp8e96nr6y/test.c -o /tmp/tmp8e96nr6y/test.o +[OpenTinker] 2025-12-01 13:24:15,363 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpp2jhwke4/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpp2jhwke4/a.out +[OpenTinker] 2025-12-01 13:24:15,381 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp8e96nr6y/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp8e96nr6y/a.out +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. 
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO ncclCommInitRankConfig comm 0x13666ca0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0xb34e3953a9afc2f1 - Init START +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO ncclCommInitRankConfig comm 0x18bf06d0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0xb34e3953a9afc2f1 - Init START +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Bootstrap timings total 0.000750 (create 0.000021, send 0.000098, recv 0.000224, ring 0.000132, delay 0.000000) +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Bootstrap timings total 0.000796 (create 0.000021, send 0.000113, recv 0.000148, ring 0.000174, delay 
0.000001) +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Bootstrap timings total 3.639589 (create 0.000020, send 0.000102, recv 3.638934, ring 0.000101, delay 0.000001) +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Bootstrap timings total 3.641966 (create 0.000021, send 0.000091, recv 3.641426, ring 0.000041, delay 0.000001) +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. 
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO comm 0x1882be50 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO comm 0x13666ca0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO comm 0x18bf06d0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO comm 0x16a41420 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. 
+lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 
10/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-pjul-8:1172059:1173271 [2] NCCL INFO [Proxy Service] Device 2 CPU core 16 +lshn-qs-pjul-8:1172059:1173272 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 139 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1172058:1173273 [1] NCCL INFO [Proxy Service] Device 1 CPU core 107 +lshn-qs-pjul-8:1172058:1173274 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 17 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-pjul-8:1172060:1173275 [3] NCCL INFO [Proxy Service] Device 3 CPU core 137 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-pjul-8:1172060:1173276 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 0 +lshn-qs-pjul-8:1172057:1173277 [0] NCCL INFO [Proxy Service] Device 0 CPU core 1 +lshn-qs-pjul-8:1172057:1173278 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 6 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO ncclCommInitRankConfig comm 0x13666ca0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0xb34e3953a9afc2f1 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173264 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.58 (kernels 0.21, alloc 0.28, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.02) +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO ncclCommInitRankConfig comm 0x16a41420 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0xb34e3953a9afc2f1 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173213 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 4.24 (kernels 0.14, alloc 0.37, bootstrap 3.64, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.02) +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO ncclCommInitRankConfig comm 0x18bf06d0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0xb34e3953a9afc2f1 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173263 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.58 (kernels 0.21, alloc 0.28, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.02) +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO ncclCommInitRankConfig comm 0x1882be50 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0xb34e3953a9afc2f1 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173214 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 4.22 (kernels 0.14, alloc 0.35, bootstrap 3.64, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.04, rest 0.03) +[OpenTinker] 2025-12-01 13:24:18,716 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-01 13:24:18,736 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-01 13:24:18,736 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-01 13:24:18,737 - root - INFO - Training model with GRPO +INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-01 13:24:18 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 
'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896 +INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896 +INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896 +INFO 12-01 13:24:34 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-01 13:24:34 [__init__.py:1815] Using max model len 16896 +INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-01 13:24:35 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-01 13:24:36 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. 
+INFO 12-01 13:24:37 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-01 13:24:38 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, 
tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-01 13:24:38 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, 
disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-01 13:24:38 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, 
reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +[rank3]:[W1201 13:24:39.729535628 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +[rank2]:[W1201 13:24:39.840665733 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +[rank0]:[W1201 13:24:39.847855218 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +[rank1]:[W1201 13:24:39.880720742 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. 
(function operator()) +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO ncclCommSplit comm 0x19c7aa30 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 1 color 2003953581 key 1- Init START +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO ncclCommSplit comm 0x155745e0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 1 color 2003953581 key 3- Init START +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO ncclCommSplit comm 0x1aca76b0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 1 color 2003953581 key 0- Init START +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO ncclCommSplit comm 0x1ba5b2b0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 1 color 2003953581 key 2- Init START +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Setting affinity for GPU 1 to 
0-47,96-143 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO comm 0x1ba5b2b0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO comm 0x19c7aa30 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO comm 0x1aca76b0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO comm 0x155745e0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 
2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 20/24 : 0 1 2 3 
+lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173418 [3] NCCL INFO [Proxy Service] Device 3 CPU core 10 +lshn-qs-pjul-8:1172060:1173419 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 11 +lshn-qs-pjul-8:1172059:1173420 [2] NCCL INFO [Proxy Service] Device 2 CPU core 114 +lshn-qs-pjul-8:1172059:1173421 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 19 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-pjul-8:1172057:1173422 [0] NCCL INFO [Proxy Service] Device 0 CPU core 116 +lshn-qs-pjul-8:1172057:1173423 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 21 +lshn-qs-pjul-8:1172058:1173424 [1] NCCL INFO [Proxy Service] Device 1 CPU core 26 +lshn-qs-pjul-8:1172058:1173425 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 28 +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls 
channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO ncclCommSplit comm 0x155745e0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 1 color 2003953581 key 3 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO ncclCommSplit comm 0x1aca76b0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 1 color 2003953581 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO ncclCommSplit comm 0x19c7aa30 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 1 color 2003953581 key 1 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO ncclCommSplit comm 0x1ba5b2b0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 1 color 2003953581 key 2 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173407 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.21 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.16) +lshn-qs-pjul-8:1172057:1173414 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.04) +lshn-qs-pjul-8:1172058:1173417 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, 
connections 0.02, rest 0.01) +lshn-qs-pjul-8:1172059:1173410 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.05) +[Gloo] Rank 0 is connected to 3[Gloo] Rank 1 peer ranks. Expected number of connected peer ranks is : [Gloo] Rank 2[Gloo] Rank 3 is connected to 33 + is connected to 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : peer ranks. Expected number of connected peer ranks is : peer ranks. Expected number of connected peer ranks is : 3 +3 +3 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO ncclCommSplit comm 0x1adbb6b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 2 color 59908776 key 0- Init START +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO comm 
0x1adbb6b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 29/64 : 0 
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 60/64 : 0 
+lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172057:1173448 [0] NCCL INFO [Proxy Service] Device 0 CPU core 7 +lshn-qs-pjul-8:1172057:1173449 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 34 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172057:1173443 [0] NCCL INFO ncclCommSplit comm 0x1adbb6b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 2 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173443 [0] 
NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO ncclCommSplit comm 0x19d8f5e0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 4 color 440515407 key 0- Init START +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO comm 0x19d8f5e0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 01/64 : 0 
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 32/64 : 0 
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Channel 63/64 : 0 
+lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172058:1173463 [1] NCCL INFO [Proxy Service] Device 1 CPU core 12 +lshn-qs-pjul-8:1172058:1173464 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 0 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO ncclCommSplit comm 0x19d8f5e0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 4 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173459 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] 
Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO ncclCommSplit comm 0x1bb6f140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 6 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO comm 0x1bb6f140 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 04/64 : 0 
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 35/64 : 0 
+lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] 
-1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172059:1173478 [2] NCCL INFO [Proxy Service] Device 2 CPU core 109 +lshn-qs-pjul-8:1172059:1173479 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 99 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO ncclCommSplit comm 0x1bb6f140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 6 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173474 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO ncclCommSplit comm 0x15688960 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 8 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO comm 0x15688960 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172060:1173489 
[3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO 
Channel 37/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 
[13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172060:1173495 [3] NCCL INFO [Proxy Service] Device 3 CPU core 116 +lshn-qs-pjul-8:1172060:1173496 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 19 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO ncclCommSplit comm 0x15688960 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 8 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173489 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO ncclCommSplit comm 0x1c505190 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 9 color 59908776 key 0- Init START +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO comm 0x1c505190 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 17/64 : 0 
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 48/64 : 0 
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 
+lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173501 [0] NCCL INFO [Proxy Service] Device 0 CPU core 103 +lshn-qs-pjul-8:1172057:1173505 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 131 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO ncclCommSplit comm 0x1c505190 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 9 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173494 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO ncclCommSplit comm 0x1b4ebe30 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 11 color 440515407 key 0- Init START +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO comm 0x1b4ebe30 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172058:1173519 [1] NCCL INFO [Proxy Service] Device 1 CPU core 123 +lshn-qs-pjul-8:1172058:1173520 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 130 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO ncclCommSplit comm 0x1b4ebe30 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 11 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173515 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.05, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO ncclCommSplit comm 0x1bc771b0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 13 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO comm 0x1bc771b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172059:1173534 [2] NCCL INFO [Proxy Service] Device 2 CPU core 137 +lshn-qs-pjul-8:1172059:1173535 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 14 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO ncclCommSplit comm 0x1bc771b0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 13 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173530 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO ncclCommSplit comm 0x157909d0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 15 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO comm 0x157909d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172060:1173545 
[3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO 
Channel 37/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 
[13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172060:1173551 [3] NCCL INFO [Proxy Service] Device 3 CPU core 99 +lshn-qs-pjul-8:1172060:1173552 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 109 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO ncclCommSplit comm 0x157909d0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 15 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173545 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO ncclCommSplit comm 0x1c60cda0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 16 color 59908776 key 0- Init START +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO comm 0x1c60cda0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL 
INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 
43/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] 
-1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172057:1173560 [0] NCCL INFO [Proxy Service] Device 0 CPU core 105 +lshn-qs-pjul-8:1172057:1173561 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 143 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO ncclCommSplit comm 0x1c60cda0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 16 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173550 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO ncclCommSplit comm 0x1b5f3a40 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 18 color 440515407 key 0- Init START +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO comm 0x1b5f3a40 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172058:1173575 [1] NCCL INFO [Proxy Service] Device 1 CPU core 42 +lshn-qs-pjul-8:1172058:1173576 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 22 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO ncclCommSplit comm 0x1b5f3a40 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 18 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173571 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO ncclCommSplit comm 0x19012070 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 20 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO comm 0x19012070 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172059:1173590 [2] NCCL INFO [Proxy Service] Device 2 CPU core 132 +lshn-qs-pjul-8:1172059:1173591 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 116 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO ncclCommSplit comm 0x19012070 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 20 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173586 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO ncclCommSplit comm 0x1b6d3200 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 22 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO comm 0x1b6d3200 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172060:1173601 
[3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO 
Channel 37/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 
[13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172060:1173607 [3] NCCL INFO [Proxy Service] Device 3 CPU core 120 +lshn-qs-pjul-8:1172060:1173608 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 39 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO ncclCommSplit comm 0x1b6d3200 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 22 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173601 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO ncclCommSplit comm 0x1c7149b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 23 color 59908776 key 0- Init START +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO comm 0x1c7149b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL 
INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 
43/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] 
-1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172057:1173616 [0] NCCL INFO [Proxy Service] Device 0 CPU core 25 +lshn-qs-pjul-8:1172057:1173617 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 12 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO ncclCommSplit comm 0x1c7149b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 23 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173606 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.03) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO ncclCommSplit comm 0x1b6fb650 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 25 color 440515407 key 0- Init START +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO comm 0x1b6fb650 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172058:1173631 [1] NCCL INFO [Proxy Service] Device 1 CPU core 44 +lshn-qs-pjul-8:1172058:1173632 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 0 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO ncclCommSplit comm 0x1b6fb650 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 25 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173627 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO ncclCommSplit comm 0x19119c80 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 27 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO comm 0x19119c80 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172059:1173646 [2] NCCL INFO [Proxy Service] Device 2 CPU core 16 +lshn-qs-pjul-8:1172059:1173647 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 19 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO ncclCommSplit comm 0x19119c80 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 27 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173642 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO ncclCommSplit comm 0x1b7dae10 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 29 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO comm 0x1b7dae10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172060:1173657 
[3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO 
Channel 37/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 
[13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172060:1173663 [3] NCCL INFO [Proxy Service] Device 3 CPU core 42 +lshn-qs-pjul-8:1172060:1173664 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 7 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO ncclCommSplit comm 0x1b7dae10 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 29 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173657 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO ncclCommSplit comm 0x1c81c5c0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 30 color 59908776 key 0- Init START +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO comm 0x1c81c5c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL 
INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 
43/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] 
-1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172057:1173672 [0] NCCL INFO [Proxy Service] Device 0 CPU core 136 +lshn-qs-pjul-8:1172057:1173673 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 105 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO ncclCommSplit comm 0x1c81c5c0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 30 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173662 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO ncclCommSplit comm 0x1b803260 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 32 color 440515407 key 0- Init START +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO comm 0x1b803260 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172058:1173687 [1] NCCL INFO [Proxy Service] Device 1 CPU core 116 +lshn-qs-pjul-8:1172058:1173688 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 124 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO ncclCommSplit comm 0x1b803260 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 32 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173683 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO ncclCommSplit comm 0x19221890 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 34 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO comm 0x19221890 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172059:1173702 [2] NCCL INFO [Proxy Service] Device 2 CPU core 3 +lshn-qs-pjul-8:1172059:1173703 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 1 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO ncclCommSplit comm 0x19221890 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 34 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173698 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Using network Socket +INFO 12-01 13:24:40 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-01 13:24:40 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-01 13:24:40 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO ncclCommSplit comm 0x1b8e2a20 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 36 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO comm 0x1b8e2a20 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 04/64 : 0 
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 35/64 : 0 
+lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] 
-1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1172060:1173714 [3] NCCL INFO [Proxy Service] Device 3 CPU core 23 +lshn-qs-pjul-8:1172060:1173715 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 126 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO ncclCommSplit comm 0x1b8e2a20 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 36 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173713 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +INFO 12-01 13:24:40 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-01 13:24:40 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-01 13:24:40 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-01 13:24:40 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-01 13:24:41 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-01 13:24:41 [weight_utils.py:406] No model.safetensors.index.json found in remote. +INFO 12-01 13:24:42 [weight_utils.py:406] No model.safetensors.index.json found in remote. 
+ + Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 16/0 : 
1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] 
NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM +INFO 12-01 13:24:56 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-01 13:24:56 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] 
NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM +INFO 12-01 13:24:56 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-01 13:24:56 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] 
NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173834 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1172058:1173831 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1172057:1173833 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1172060:1173832 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. 
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +[OpenTinker] 2025-12-01 13:24:57,487 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value. +lshn-qs-pjul-8:1172060:1172060 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172057:1172057 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1172058 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172059:1172059 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO ncclCommSplit comm 0x4a601550 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 37 color 2003953581 key 1- Init START +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO ncclCommSplit comm 0x4a6d8c30 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x13666ca0 splitCount 37 color 2003953581 key 3- Init START +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO ncclCommSplit comm 0x4a267b90 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 37 color 2003953581 key 2- Init START +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO ncclCommSplit 
comm 0x4b5afb40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 37 color 2003953581 key 0- Init START +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO comm 0x4a267b90 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO comm 0x4b5afb40 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO comm 0x4a601550 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO comm 0x4a6d8c30 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Trees [0] 
2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL 
INFO Channel 11/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1172058:1173849 [1] NCCL INFO [Proxy Service] Device 1 CPU core 122 +lshn-qs-pjul-8:1172059:1173850 [2] NCCL INFO [Proxy Service] Device 2 CPU core 14 +lshn-qs-pjul-8:1172058:1173851 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 127 +lshn-qs-pjul-8:1172059:1173852 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 15 +lshn-qs-pjul-8:1172060:1173853 [3] NCCL INFO [Proxy Service] Device 3 CPU core 43 +lshn-qs-pjul-8:1172060:1173854 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 44 
+lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-pjul-8:1172057:1173855 [0] NCCL INFO [Proxy Service] Device 0 CPU core 130 +lshn-qs-pjul-8:1172057:1173856 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 133 +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO ncclCommSplit comm 0x4a601550 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x18bf06d0 splitCount 37 color 2003953581 key 1 - Init COMPLETE +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO ncclCommSplit comm 0x4a267b90 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1882be50 splitCount 37 color 2003953581 key 2 - Init COMPLETE +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO ncclCommSplit comm 0x4b5afb40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x16a41420 splitCount 37 color 2003953581 key 0 - Init COMPLETE +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO ncclCommSplit comm 0x4a6d8c30 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 
0x13666ca0 splitCount 37 color 2003953581 key 3 - Init COMPLETE +lshn-qs-pjul-8:1172058:1173847 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01) +lshn-qs-pjul-8:1172059:1173848 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01) +lshn-qs-pjul-8:1172057:1173846 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01) +lshn-qs-pjul-8:1172060:1173843 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.01, connections 0.02, rest 0.01) +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM 
+lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 08/0 : 
3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] 
NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM 
+lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-pjul-8:1172057:1173860 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1172060:1173859 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1172059:1173858 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1172058:1173857 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +INFO 12-01 13:24:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 13:24:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 13:24:58 [block_pool.py:292] Successfully reset prefix cache +wandb: WARNING The 
`run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + + 0%| | 0/1024 [00:00._remove at 0x7f84695cce00> +Traceback (most recent call last): + File "/root/miniconda3/lib/python3.11/_weakrefset.py", line 39, in _remove + def _remove(item, selfref=ref(self)): + + File "/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/math_verify/utils.py", line 56, in handler + raise TimeoutException("Operation timed out!") +math_verify.errors.TimeoutException: Operation timed out! + + 4%|▍ | 44/1024 [1:50:26<39:20:17, 144.51s/it] + {'loss': 0.0589, 'grad_norm': 0.004726089537143707, 'learning_rate': 1e-05, 'num_tokens': 33522133.0, 'completions/mean_length': 4731.3515625, 'completions/min_length': 369.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4639.5986328125, 'completions/min_terminated_length': 369.0, 'completions/max_terminated_length': 14572.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019276604056358337, 'sampling/sampling_logp_difference/max': 8.773368835449219, 'sampling/importance_sampling_ratio/min': 0.0001548011932754889, 'sampling/importance_sampling_ratio/mean': 0.9999152421951294, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0001292675733566, 'clip_ratio/low_mean': 1.772546147549292e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.94652681734442e-06, 'clip_ratio/high_max': 1.578610726937768e-05, 'clip_ratio/region_mean': 2.1671988179150503e-05, 'epoch': 0.04} + + 4%|▍ | 44/1024 [1:50:26<39:20:17, 144.51s/it]INFO 12-01 15:15:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:15:25 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:15:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:15:25 [block_pool.py:292] Successfully reset prefix cache + + 4%|▍ | 45/1024 [1:52:58<39:55:33, 146.82s/it] + {'loss': 0.0202, 'grad_norm': 0.0011808272683992982, 'learning_rate': 1e-05, 'num_tokens': 34429384.0, 'completions/mean_length': 6908.8984375, 'completions/min_length': 631.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6834.29150390625, 'completions/min_terminated_length': 631.0, 'completions/max_terminated_length': 15661.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021076666191220284, 'sampling/sampling_logp_difference/max': 7.173947334289551, 'sampling/importance_sampling_ratio/min': 0.0007662919815629721, 'sampling/importance_sampling_ratio/mean': 0.9999626278877258, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0723063945770264, 'clip_ratio/low_mean': 8.259907644969644e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9861447526636766e-06, 'clip_ratio/high_max': 7.944579010654707e-06, 'clip_ratio/region_mean': 1.024605239763332e-05, 'epoch': 0.04} + + 4%|▍ | 45/1024 [1:52:58<39:55:33, 146.82s/it]INFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:17:57 [block_pool.py:292] Successfully reset prefix cache + + 4%|▍ | 46/1024 [1:55:55<42:23:12, 156.03s/it] + {'loss': 0.0433, 'grad_norm': 0.003600373398512602, 'learning_rate': 1e-05, 'num_tokens': 35302474.0, 'completions/mean_length': 6679.140625, 'completions/min_length': 828.0, 'completions/max_length': 16384.0, 
'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6446.22412109375, 'completions/min_terminated_length': 828.0, 'completions/max_terminated_length': 16348.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019808633252978325, 'sampling/sampling_logp_difference/max': 9.312483787536621, 'sampling/importance_sampling_ratio/min': 9.02900064829737e-05, 'sampling/importance_sampling_ratio/mean': 0.9998806715011597, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9413202852010727, 'clip_ratio/low_mean': 2.6357692036071967e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.222089392489579e-06, 'clip_ratio/high_max': 8.888357569958316e-06, 'clip_ratio/region_mean': 2.8579780860127357e-05, 'epoch': 0.04} + + 4%|▍ | 46/1024 [1:55:55<42:23:12, 156.03s/it]INFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:20:55 [block_pool.py:292] Successfully reset prefix cache + + 5%|▍ | 47/1024 [1:58:45<43:25:00, 159.98s/it] + {'loss': -0.0024, 'grad_norm': 0.003302425378933549, 'learning_rate': 1e-05, 'num_tokens': 36093941.0, 'completions/mean_length': 5954.5859375, 'completions/min_length': 95.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5872.46435546875, 'completions/min_terminated_length': 95.0, 'completions/max_terminated_length': 16253.0, 'rewards/accuracy_reward/mean': 0.1640625, 'rewards/accuracy_reward/std': 0.371787428855896, 'reward': 0.1640625, 'reward_std': 0.1990984082221985, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022528307512402534, 
'sampling/sampling_logp_difference/max': 5.921712875366211, 'sampling/importance_sampling_ratio/min': 0.0026806045789271593, 'sampling/importance_sampling_ratio/mean': 0.9998957514762878, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.200403742492199, 'clip_ratio/low_mean': 1.6833528775350715e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3411616894009057e-06, 'clip_ratio/high_max': 9.364646757603623e-06, 'clip_ratio/region_mean': 1.9174690351064783e-05, 'epoch': 0.04} + + 5%|▍ | 47/1024 [1:58:45<43:25:00, 159.98s/it]INFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:23:44 [block_pool.py:292] Successfully reset prefix cache + + 5%|▍ | 48/1024 [2:01:27<43:34:47, 160.75s/it] + {'loss': 0.0979, 'grad_norm': 0.005992463324218988, 'learning_rate': 1e-05, 'num_tokens': 36893486.0, 'completions/mean_length': 6109.1953125, 'completions/min_length': 656.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5946.103515625, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 15867.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.40373340249061584, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018979201093316078, 'sampling/sampling_logp_difference/max': 10.624975204467773, 'sampling/importance_sampling_ratio/min': 2.4301432858919725e-05, 'sampling/importance_sampling_ratio/mean': 0.9999576807022095, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9069097489118576, 'clip_ratio/low_mean': 4.7836430894676596e-05, 'clip_ratio/low_min': 6.161485543998424e-06, 'clip_ratio/high_mean': 3.944288664570195e-06, 'clip_ratio/high_max': 1.2503618108894443e-05, 
'clip_ratio/region_mean': 5.1780719331873115e-05, 'epoch': 0.04} + + 5%|▍ | 48/1024 [2:01:27<43:34:47, 160.75s/it]INFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:26:27 [block_pool.py:292] Successfully reset prefix cache + + 5%|▍ | 49/1024 [2:04:09<43:36:24, 161.01s/it] + {'loss': 0.1217, 'grad_norm': 0.005304713733494282, 'learning_rate': 1e-05, 'num_tokens': 37716027.0, 'completions/mean_length': 6265.5390625, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6022.6962890625, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15331.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29272884130477905, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019227473065257072, 'sampling/sampling_logp_difference/max': 7.968747615814209, 'sampling/importance_sampling_ratio/min': 0.0003461121814325452, 'sampling/importance_sampling_ratio/mean': 0.9998800754547119, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9107594564557076, 'clip_ratio/low_mean': 2.73638818271138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.768986860246514e-06, 'clip_ratio/high_max': 1.1075947440986056e-05, 'clip_ratio/region_mean': 3.013286891473399e-05, 'epoch': 0.05} + + 5%|▍ | 49/1024 [2:04:09<43:36:24, 161.01s/it]INFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:29:08 [block_pool.py:292] Successfully reset prefix cache + + 5%|▍ | 50/1024 [2:06:53<43:49:28, 161.98s/it] + {'loss': 0.0401, 
'grad_norm': 0.0017410843865945935, 'learning_rate': 1e-05, 'num_tokens': 38519738.0, 'completions/mean_length': 6143.1796875, 'completions/min_length': 170.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5897.400390625, 'completions/min_terminated_length': 170.0, 'completions/max_terminated_length': 15860.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2301519215106964, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019512062892317772, 'sampling/sampling_logp_difference/max': 5.612663269042969, 'sampling/importance_sampling_ratio/min': 0.0036513316445052624, 'sampling/importance_sampling_ratio/mean': 0.9998773336410522, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9168931543827057, 'clip_ratio/low_mean': 3.135283236588293e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.674950448839809e-06, 'clip_ratio/high_max': 1.0917767667706357e-05, 'clip_ratio/region_mean': 3.50277827010359e-05, 'epoch': 0.05} + + 5%|▍ | 50/1024 [2:06:53<43:49:28, 161.98s/it]INFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:31:53 [block_pool.py:292] Successfully reset prefix cache + + 5%|▍ | 51/1024 [2:09:34<43:42:41, 161.73s/it] + {'loss': 0.0544, 'grad_norm': 0.004612576216459274, 'learning_rate': 1e-05, 'num_tokens': 39461012.0, 'completions/mean_length': 7165.265625, 'completions/min_length': 713.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7092.67724609375, 'completions/min_terminated_length': 713.0, 'completions/max_terminated_length': 15616.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 
'reward': 0.3125, 'reward_std': 0.35505855083465576, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0201116893440485, 'sampling/sampling_logp_difference/max': 9.999534606933594, 'sampling/importance_sampling_ratio/min': 4.5421067625284195e-05, 'sampling/importance_sampling_ratio/mean': 1.0000245571136475, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9690218195319176, 'clip_ratio/low_mean': 2.6178069106208568e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7445629459398333e-06, 'clip_ratio/high_max': 5.4981305765977595e-06, 'clip_ratio/region_mean': 2.99226320521484e-05, 'epoch': 0.05} + + 5%|▍ | 51/1024 [2:09:34<43:42:41, 161.73s/it]INFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:34:34 [block_pool.py:292] Successfully reset prefix cache + + 5%|▌ | 52/1024 [2:12:09<43:06:30, 159.66s/it] + {'loss': -0.0235, 'grad_norm': 0.003172830445691943, 'learning_rate': 1e-05, 'num_tokens': 40202979.0, 'completions/mean_length': 5617.9296875, 'completions/min_length': 162.0, 'completions/max_length': 16007.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5617.9296875, 'completions/min_terminated_length': 162.0, 'completions/max_terminated_length': 16007.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020904643461108208, 'sampling/sampling_logp_difference/max': 13.609129905700684, 'sampling/importance_sampling_ratio/min': 1.229221084031451e-06, 'sampling/importance_sampling_ratio/mean': 0.9999560117721558, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0479632839560509, 'clip_ratio/low_mean': 2.1866131419301382e-05, 
'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2383335906160937e-06, 'clip_ratio/high_max': 1.2953334362464375e-05, 'clip_ratio/region_mean': 2.5104465066760895e-05, 'epoch': 0.05} + + 5%|▌ | 52/1024 [2:12:09<43:06:30, 159.66s/it]INFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:37:09 [block_pool.py:292] Successfully reset prefix cache + + 5%|▌ | 53/1024 [2:14:34<41:51:45, 155.21s/it] + {'loss': 0.0336, 'grad_norm': 0.003333345288410783, 'learning_rate': 1e-05, 'num_tokens': 40989532.0, 'completions/mean_length': 5995.3203125, 'completions/min_length': 397.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5913.51953125, 'completions/min_terminated_length': 397.0, 'completions/max_terminated_length': 16094.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021745413541793823, 'sampling/sampling_logp_difference/max': 9.405362129211426, 'sampling/importance_sampling_ratio/min': 8.228168007917702e-05, 'sampling/importance_sampling_ratio/mean': 0.9999282360076904, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.022934041917324, 'clip_ratio/low_mean': 4.556761541607557e-05, 'clip_ratio/low_min': 8.631802302261349e-06, 'clip_ratio/high_mean': 4.841006557398941e-06, 'clip_ratio/high_max': 1.4129082956060302e-05, 'clip_ratio/region_mean': 5.040862197347451e-05, 'epoch': 0.05} + + 5%|▌ | 53/1024 [2:14:34<41:51:45, 155.21s/it]INFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-01 15:39:33 [block_pool.py:292] Successfully reset prefix cache + + 5%|▌ | 54/1024 [2:17:22<42:53:37, 159.19s/it] + {'loss': 0.0799, 'grad_norm': 0.005538261961191893, 'learning_rate': 1e-05, 'num_tokens': 41813914.0, 'completions/mean_length': 6297.859375, 'completions/min_length': 1243.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6055.79248046875, 'completions/min_terminated_length': 1243.0, 'completions/max_terminated_length': 15648.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019708994776010513, 'sampling/sampling_logp_difference/max': 8.659659385681152, 'sampling/importance_sampling_ratio/min': 0.00017344337538816035, 'sampling/importance_sampling_ratio/mean': 0.9999532699584961, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9511058703064919, 'clip_ratio/low_mean': 3.960530659696815e-05, 'clip_ratio/low_min': 3.4269107800355414e-06, 'clip_ratio/high_mean': 6.531613848892448e-06, 'clip_ratio/high_max': 2.286436574649997e-05, 'clip_ratio/region_mean': 4.6136920445860596e-05, 'epoch': 0.05} + + 5%|▌ | 54/1024 [2:17:22<42:53:37, 159.19s/it]INFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:42:22 [block_pool.py:292] Successfully reset prefix cache + + 5%|▌ | 55/1024 [2:19:26<40:01:09, 148.68s/it] + {'loss': -0.0177, 'grad_norm': 0.0024318129289895296, 'learning_rate': 1e-05, 'num_tokens': 42443288.0, 'completions/mean_length': 4765.046875, 'completions/min_length': 401.0, 'completions/max_length': 14051.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4765.046875, 
'completions/min_terminated_length': 401.0, 'completions/max_terminated_length': 14051.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29196253418922424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01920286938548088, 'sampling/sampling_logp_difference/max': 9.175529479980469, 'sampling/importance_sampling_ratio/min': 0.0001035423920257017, 'sampling/importance_sampling_ratio/mean': 0.9999518394470215, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9130316227674484, 'clip_ratio/low_mean': 2.561447990956367e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.143934355241072e-06, 'clip_ratio/high_max': 4.575737420964288e-06, 'clip_ratio/region_mean': 2.6758414151117904e-05, 'epoch': 0.05} + + 5%|▌ | 55/1024 [2:19:26<40:01:09, 148.68s/it]INFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:44:26 [block_pool.py:292] Successfully reset prefix cache + + 5%|▌ | 56/1024 [2:22:27<42:32:06, 158.19s/it] + {'loss': -0.0036, 'grad_norm': 0.0018957280553877354, 'learning_rate': 1e-05, 'num_tokens': 43287600.0, 'completions/mean_length': 6411.5, 'completions/min_length': 321.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5834.578125, 'completions/min_terminated_length': 321.0, 'completions/max_terminated_length': 15445.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.1990983933210373, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018554572016000748, 'sampling/sampling_logp_difference/max': 6.124218463897705, 'sampling/importance_sampling_ratio/min': 0.0021892013028264046, 
'sampling/importance_sampling_ratio/mean': 0.9999212622642517, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8110766112804413, 'clip_ratio/low_mean': 4.221943618176738e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.771039933373686e-06, 'clip_ratio/high_max': 7.084159733494744e-06, 'clip_ratio/region_mean': 4.3990476115141064e-05, 'epoch': 0.05} + + 5%|▌ | 56/1024 [2:22:27<42:32:06, 158.19s/it]INFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:47:26 [block_pool.py:292] Successfully reset prefix cache + + 6%|▌ | 57/1024 [2:25:02<42:14:27, 157.26s/it] + {'loss': 0.0274, 'grad_norm': 0.002431448083370924, 'learning_rate': 1e-05, 'num_tokens': 44145524.0, 'completions/mean_length': 6552.40625, 'completions/min_length': 348.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6235.2578125, 'completions/min_terminated_length': 348.0, 'completions/max_terminated_length': 15508.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.3114011883735657, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020403606817126274, 'sampling/sampling_logp_difference/max': 2.974147081375122, 'sampling/importance_sampling_ratio/min': 0.051090992987155914, 'sampling/importance_sampling_ratio/mean': 0.999876081943512, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0034996420145035, 'clip_ratio/low_mean': 4.334260950145108e-05, 'clip_ratio/low_min': 8.570448699174449e-06, 'clip_ratio/high_mean': 1.6897372461244231e-06, 'clip_ratio/high_max': 6.7589489844976924e-06, 'clip_ratio/region_mean': 4.503234697494918e-05, 'epoch': 0.05} + + 6%|▌ | 57/1024 [2:25:02<42:14:27, 157.26s/it]INFO 12-01 15:50:01 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-01 15:50:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:50:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:50:01 [block_pool.py:292] Successfully reset prefix cache + + 6%|▌ | 58/1024 [2:27:19<40:33:14, 151.13s/it] + {'loss': 0.0344, 'grad_norm': 0.004493447951972485, 'learning_rate': 1e-05, 'num_tokens': 44763895.0, 'completions/mean_length': 4688.7734375, 'completions/min_length': 345.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4408.08837890625, 'completions/min_terminated_length': 345.0, 'completions/max_terminated_length': 13257.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.26196980476379395, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01916680857539177, 'sampling/sampling_logp_difference/max': 10.364669799804688, 'sampling/importance_sampling_ratio/min': 3.1526888051303104e-05, 'sampling/importance_sampling_ratio/mean': 0.9999460577964783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9620971381664276, 'clip_ratio/low_mean': 1.0045687076853937e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.422987098630983e-06, 'clip_ratio/high_max': 2.1032463337178342e-05, 'clip_ratio/region_mean': 1.646867417548492e-05, 'epoch': 0.05} + + 6%|▌ | 58/1024 [2:27:19<40:33:14, 151.13s/it]INFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:52:18 [block_pool.py:292] Successfully reset prefix cache + + 6%|▌ | 59/1024 [2:29:44<40:01:00, 149.29s/it] + {'loss': 0.0813, 'grad_norm': 0.0049595762975513935, 'learning_rate': 1e-05, 'num_tokens': 45470335.0, 'completions/mean_length': 5381.1875, 'completions/min_length': 
25.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5294.55126953125, 'completions/min_terminated_length': 25.0, 'completions/max_terminated_length': 14591.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3090519607067108, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020656142383813858, 'sampling/sampling_logp_difference/max': 15.624994277954102, 'sampling/importance_sampling_ratio/min': 1.6373864752949885e-07, 'sampling/importance_sampling_ratio/mean': 0.9998573660850525, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0265433564782143, 'clip_ratio/low_mean': 2.8500278403953416e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.691486530347902e-06, 'clip_ratio/high_max': 3.076594612139161e-05, 'clip_ratio/region_mean': 3.619176493430132e-05, 'epoch': 0.05} + + 6%|▌ | 59/1024 [2:29:44<40:01:00, 149.29s/it]INFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:54:43 [block_pool.py:292] Successfully reset prefix cache + + 6%|▌ | 60/1024 [2:32:13<39:57:14, 149.21s/it] + {'loss': 0.068, 'grad_norm': 0.00655899103730917, 'learning_rate': 1e-05, 'num_tokens': 46206971.0, 'completions/mean_length': 5613.84375, 'completions/min_length': 55.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5529.03955078125, 'completions/min_terminated_length': 55.0, 'completions/max_terminated_length': 15006.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3090519607067108, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020495962351560593, 
'sampling/sampling_logp_difference/max': 3.4162673950195312, 'sampling/importance_sampling_ratio/min': 0.03283476456999779, 'sampling/importance_sampling_ratio/mean': 0.999952495098114, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0289503335952759, 'clip_ratio/low_mean': 3.143254116366734e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.581610250577796e-06, 'clip_ratio/high_max': 2.6326441002311185e-05, 'clip_ratio/region_mean': 3.8014151868992485e-05, 'epoch': 0.06} + + 6%|▌ | 60/1024 [2:32:13<39:57:14, 149.21s/it]INFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:57:12 [block_pool.py:292] Successfully reset prefix cache + + 6%|▌ | 61/1024 [2:34:47<40:20:55, 150.84s/it] + {'loss': 0.0459, 'grad_norm': 0.007459669373929501, 'learning_rate': 1e-05, 'num_tokens': 46940112.0, 'completions/mean_length': 5577.2890625, 'completions/min_length': 784.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5492.19677734375, 'completions/min_terminated_length': 784.0, 'completions/max_terminated_length': 14763.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.39082521200180054, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018994126468896866, 'sampling/sampling_logp_difference/max': 14.014364242553711, 'sampling/importance_sampling_ratio/min': 8.196697649509588e-07, 'sampling/importance_sampling_ratio/mean': 1.0000065565109253, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9836367890238762, 'clip_ratio/low_mean': 3.3687326776998816e-05, 'clip_ratio/low_min': 5.745277576352237e-06, 'clip_ratio/high_mean': 8.083893476396042e-06, 'clip_ratio/high_max': 3.233557390558417e-05, 
'clip_ratio/region_mean': 4.1771219912334345e-05, 'epoch': 0.06} + + 6%|▌ | 61/1024 [2:34:47<40:20:55, 150.84s/it]INFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 15:59:47 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-01 16:01:44,103 - math_verify.grader - WARNING - Timeout during comparison + + 6%|▌ | 62/1024 [2:37:45<42:28:16, 158.94s/it] + {'loss': -0.0013, 'grad_norm': 0.005132914055138826, 'learning_rate': 1e-05, 'num_tokens': 47796514.0, 'completions/mean_length': 6547.140625, 'completions/min_length': 266.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6311.05615234375, 'completions/min_terminated_length': 266.0, 'completions/max_terminated_length': 16273.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2751026153564453, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02021491341292858, 'sampling/sampling_logp_difference/max': 7.597993850708008, 'sampling/importance_sampling_ratio/min': 0.0005014563794247806, 'sampling/importance_sampling_ratio/mean': 0.999970018863678, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9028418883681297, 'clip_ratio/low_mean': 3.032099141364597e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.300606747165148e-06, 'clip_ratio/high_max': 1.720242698866059e-05, 'clip_ratio/region_mean': 3.462159838818479e-05, 'epoch': 0.06} + + 6%|▌ | 62/1024 [2:37:45<42:28:16, 158.94s/it]INFO 12-01 16:02:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:02:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:02:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:02:45 [block_pool.py:292] 
Successfully reset prefix cache + + 6%|▌ | 63/1024 [2:41:06<45:48:17, 171.59s/it] + {'loss': 0.0196, 'grad_norm': 0.0034147046972066164, 'learning_rate': 1e-05, 'num_tokens': 48765386.0, 'completions/mean_length': 7409.3125, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6811.00048828125, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.27198708057403564, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01943383738398552, 'sampling/sampling_logp_difference/max': 12.379810333251953, 'sampling/importance_sampling_ratio/min': 4.202586751489434e-06, 'sampling/importance_sampling_ratio/mean': 0.9998997449874878, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8614663332700729, 'clip_ratio/low_mean': 2.838153790207798e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.695532941743295e-06, 'clip_ratio/high_max': 1.078213176697318e-05, 'clip_ratio/region_mean': 3.1077070843821275e-05, 'epoch': 0.06} + + 6%|▌ | 63/1024 [2:41:06<45:48:17, 171.59s/it]INFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:06:06 [block_pool.py:292] Successfully reset prefix cache + + 6%|▋ | 64/1024 [2:43:38<44:11:06, 165.69s/it] + {'loss': 0.0371, 'grad_norm': 0.004101228900253773, 'learning_rate': 1e-05, 'num_tokens': 49606280.0, 'completions/mean_length': 6420.859375, 'completions/min_length': 273.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6181.744140625, 'completions/min_terminated_length': 273.0, 'completions/max_terminated_length': 14591.0, 
'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01929381489753723, 'sampling/sampling_logp_difference/max': 8.258644104003906, 'sampling/importance_sampling_ratio/min': 0.000259009946603328, 'sampling/importance_sampling_ratio/mean': 1.0000226497650146, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9671022593975067, 'clip_ratio/low_mean': 3.695166174111364e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8833828764618374e-06, 'clip_ratio/high_max': 1.153353150584735e-05, 'clip_ratio/region_mean': 3.98350443902018e-05, 'epoch': 0.06} + + 6%|▋ | 64/1024 [2:43:38<44:11:06, 165.69s/it]INFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:08:41 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 6%|▋ | 65/1024 [2:46:04<42:31:38, 159.64s/it] + {'loss': 0.0601, 'grad_norm': 0.0077895247377455235, 'learning_rate': 1e-05, 'num_tokens': 50246457.0, 'completions/mean_length': 4852.7578125, 'completions/min_length': 92.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4761.96044921875, 'completions/min_terminated_length': 92.0, 'completions/max_terminated_length': 14971.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.35400262475013733, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01895500347018242, 'sampling/sampling_logp_difference/max': 10.624988555908203, 'sampling/importance_sampling_ratio/min': 2.4301109078805894e-05, 'sampling/importance_sampling_ratio/mean': 0.9999773502349854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9933939427137375, 'clip_ratio/low_mean': 4.231768923546042e-05, 'clip_ratio/low_min': 5.164009053260088e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.231768923546042e-05, 'epoch': 0.06} + + 6%|▋ | 65/1024 [2:46:04<42:31:38, 159.64s/it]INFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:11:03 [block_pool.py:292] Successfully reset prefix cache + + 6%|▋ | 66/1024 [2:48:45<42:38:52, 160.26s/it] + {'loss': 0.0534, 'grad_norm': 0.00207411777228117, 'learning_rate': 1e-05, 'num_tokens': 51141597.0, 'completions/mean_length': 6840.03125, 'completions/min_length': 728.0, 'completions/max_length': 15610.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6840.03125, 'completions/min_terminated_length': 728.0, 'completions/max_terminated_length': 15610.0, 
'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2790592312812805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02091015875339508, 'sampling/sampling_logp_difference/max': 15.411253929138184, 'sampling/importance_sampling_ratio/min': 2.0275774659239687e-07, 'sampling/importance_sampling_ratio/mean': 0.9999240636825562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9959733113646507, 'clip_ratio/low_mean': 3.009997408298659e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.009997408298659e-05, 'epoch': 0.06} + + 6%|▋ | 66/1024 [2:48:45<42:38:52, 160.26s/it]INFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:13:45 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 67/1024 [2:51:17<41:53:20, 157.58s/it] + {'loss': 0.0238, 'grad_norm': 0.006496666464954615, 'learning_rate': 1e-05, 'num_tokens': 52001758.0, 'completions/mean_length': 6567.3828125, 'completions/min_length': 234.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6331.7841796875, 'completions/min_terminated_length': 234.0, 'completions/max_terminated_length': 15249.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021580250933766365, 'sampling/sampling_logp_difference/max': 5.936847686767578, 'sampling/importance_sampling_ratio/min': 0.0026403397787362337, 'sampling/importance_sampling_ratio/mean': 0.9999523162841797, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0921807065606117, 
'clip_ratio/low_mean': 4.6152885829542356e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.853683203189576e-06, 'clip_ratio/high_max': 2.297391938554938e-05, 'clip_ratio/region_mean': 5.3006569942226633e-05, 'epoch': 0.06} + + 7%|▋ | 67/1024 [2:51:17<41:53:20, 157.58s/it]INFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:16:16 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 68/1024 [2:54:24<44:14:08, 166.58s/it] + {'loss': 0.021, 'grad_norm': 0.002272722776979208, 'learning_rate': 1e-05, 'num_tokens': 52907256.0, 'completions/mean_length': 6927.265625, 'completions/min_length': 781.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6542.84521484375, 'completions/min_terminated_length': 781.0, 'completions/max_terminated_length': 16336.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.22673700749874115, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01844738982617855, 'sampling/sampling_logp_difference/max': 16.51754379272461, 'sampling/importance_sampling_ratio/min': 6.70690099013882e-08, 'sampling/importance_sampling_ratio/mean': 0.9999938011169434, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8170016556978226, 'clip_ratio/low_mean': 1.7558751551405294e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0726623663213104e-06, 'clip_ratio/high_max': 1.2290649465285242e-05, 'clip_ratio/region_mean': 2.0631413917726604e-05, 'epoch': 0.06} + + 7%|▋ | 68/1024 [2:54:24<44:14:08, 166.58s/it]INFO 12-01 16:19:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:19:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:19:24 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-01 16:19:24 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 69/1024 [2:56:59<43:14:08, 162.98s/it] + {'loss': 0.0382, 'grad_norm': 0.005651532672345638, 'learning_rate': 1e-05, 'num_tokens': 53682100.0, 'completions/mean_length': 5889.28125, 'completions/min_length': 260.0, 'completions/max_length': 16228.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5889.28125, 'completions/min_terminated_length': 260.0, 'completions/max_terminated_length': 16228.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.32613158226013184, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020069826394319534, 'sampling/sampling_logp_difference/max': 14.67677116394043, 'sampling/importance_sampling_ratio/min': 4.226289718189946e-07, 'sampling/importance_sampling_ratio/mean': 0.9998855590820312, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0794919431209564, 'clip_ratio/low_mean': 5.522496246612718e-05, 'clip_ratio/low_min': 4.129910394112812e-06, 'clip_ratio/high_mean': 4.526967131823767e-06, 'clip_ratio/high_max': 1.016177520796191e-05, 'clip_ratio/region_mean': 5.9751928688456246e-05, 'epoch': 0.06} + + 7%|▋ | 69/1024 [2:56:59<43:14:08, 162.98s/it]INFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:21:59 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 70/1024 [2:59:38<42:52:52, 161.82s/it] + {'loss': 0.0246, 'grad_norm': 0.002985857194289565, 'learning_rate': 1e-05, 'num_tokens': 54456508.0, 'completions/mean_length': 5909.3125, 'completions/min_length': 197.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5394.16357421875, 
'completions/min_terminated_length': 197.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01927822455763817, 'sampling/sampling_logp_difference/max': 7.699061393737793, 'sampling/importance_sampling_ratio/min': 0.000453252432635054, 'sampling/importance_sampling_ratio/mean': 0.999995231628418, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8462172821164131, 'clip_ratio/low_mean': 4.575056436806335e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4492417221845244e-06, 'clip_ratio/high_max': 5.796966888738098e-06, 'clip_ratio/region_mean': 4.719980597656104e-05, 'epoch': 0.06} + + 7%|▋ | 70/1024 [2:59:38<42:52:52, 161.82s/it]INFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:24:38 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 71/1024 [3:02:39<44:23:22, 167.68s/it] + {'loss': 0.0218, 'grad_norm': 0.0036494233645498753, 'learning_rate': 1e-05, 'num_tokens': 55429663.0, 'completions/mean_length': 7465.3984375, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7177.701171875, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15579.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01909823715686798, 'sampling/sampling_logp_difference/max': 6.343155384063721, 'sampling/importance_sampling_ratio/min': 0.0017587440088391304, 
'sampling/importance_sampling_ratio/mean': 0.9998987913131714, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8792542889714241, 'clip_ratio/low_mean': 3.1553636290482245e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.821615673085034e-06, 'clip_ratio/high_max': 1.8927265045931563e-05, 'clip_ratio/region_mean': 3.737525207725412e-05, 'epoch': 0.07} + + 7%|▋ | 71/1024 [3:02:39<44:23:22, 167.68s/it]INFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:27:39 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 72/1024 [3:05:16<43:27:18, 164.33s/it] + {'loss': 0.0295, 'grad_norm': 0.003951186314225197, 'learning_rate': 1e-05, 'num_tokens': 56173314.0, 'completions/mean_length': 5674.9609375, 'completions/min_length': 71.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5590.6376953125, 'completions/min_terminated_length': 71.0, 'completions/max_terminated_length': 15670.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.29249149560928345, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01932360976934433, 'sampling/sampling_logp_difference/max': 5.742441177368164, 'sampling/importance_sampling_ratio/min': 0.003206930123269558, 'sampling/importance_sampling_ratio/mean': 0.9999845623970032, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9117730036377907, 'clip_ratio/low_mean': 3.611839565564878e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1965249743516324e-06, 'clip_ratio/high_max': 8.78609989740653e-06, 'clip_ratio/region_mean': 3.831492040262674e-05, 'epoch': 0.07} + + 7%|▋ | 72/1024 [3:05:16<43:27:18, 164.33s/it]INFO 12-01 16:30:16 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-01 16:30:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:30:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:30:16 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 73/1024 [3:07:27<40:47:22, 154.41s/it] + {'loss': 0.0328, 'grad_norm': 0.005329386796802282, 'learning_rate': 1e-05, 'num_tokens': 56799911.0, 'completions/mean_length': 4754.5390625, 'completions/min_length': 291.0, 'completions/max_length': 16325.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4754.5390625, 'completions/min_terminated_length': 291.0, 'completions/max_terminated_length': 16325.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.4111049771308899, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.01792578026652336, 'sampling/sampling_logp_difference/max': 9.36398696899414, 'sampling/importance_sampling_ratio/min': 8.575750689487904e-05, 'sampling/importance_sampling_ratio/mean': 0.9999337196350098, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8350499644875526, 'clip_ratio/low_mean': 4.657158876852918e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.872955512131739e-06, 'clip_ratio/high_max': 1.7587798083695816e-05, 'clip_ratio/region_mean': 5.244454393960041e-05, 'epoch': 0.07} + + 7%|▋ | 73/1024 [3:07:27<40:47:22, 154.41s/it]INFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:32:27 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 74/1024 [3:10:06<41:04:54, 155.68s/it] + {'loss': 0.082, 'grad_norm': 0.0036763548851013184, 'learning_rate': 1e-05, 'num_tokens': 57553986.0, 'completions/mean_length': 5744.2734375, 'completions/min_length': 4.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5488.92041015625, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16316.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018098725005984306, 'sampling/sampling_logp_difference/max': 9.082645416259766, 'sampling/importance_sampling_ratio/min': 0.00011362064105924219, 'sampling/importance_sampling_ratio/mean': 0.9999231696128845, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8065197095274925, 'clip_ratio/low_mean': 1.8536085917730816e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1438435144082177e-06, 'clip_ratio/high_max': 1.2575374057632871e-05, 'clip_ratio/region_mean': 2.1679929204765358e-05, 'epoch': 0.07} + + 7%|▋ | 74/1024 [3:10:06<41:04:54, 155.68s/it]INFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:35:05 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 75/1024 [3:12:46<41:24:05, 157.06s/it] + {'loss': 0.0139, 'grad_norm': 0.0038320303428918123, 'learning_rate': 1e-05, 'num_tokens': 58438333.0, 'completions/mean_length': 6754.5234375, 'completions/min_length': 638.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6523.41650390625, 'completions/min_terminated_length': 638.0, 'completions/max_terminated_length': 16088.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2369818389415741, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02173798717558384, 
'sampling/sampling_logp_difference/max': 12.989178657531738, 'sampling/importance_sampling_ratio/min': 2.284922175022075e-06, 'sampling/importance_sampling_ratio/mean': 0.9999582767486572, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.013127624988556, 'clip_ratio/low_mean': 2.6290458890798618e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.101248914092139e-06, 'clip_ratio/high_max': 1.877081149359583e-05, 'clip_ratio/region_mean': 3.239170769120392e-05, 'epoch': 0.07} + + 7%|▋ | 75/1024 [3:12:46<41:24:05, 157.06s/it]INFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:37:46 [block_pool.py:292] Successfully reset prefix cache + + 7%|▋ | 76/1024 [3:15:20<41:07:09, 156.15s/it] + {'loss': 0.0483, 'grad_norm': 0.004985450301319361, 'learning_rate': 1e-05, 'num_tokens': 59249562.0, 'completions/mean_length': 6203.5390625, 'completions/min_length': 408.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6123.3779296875, 'completions/min_terminated_length': 408.0, 'completions/max_terminated_length': 12421.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019999932497739792, 'sampling/sampling_logp_difference/max': 5.3917694091796875, 'sampling/importance_sampling_ratio/min': 0.004553908482193947, 'sampling/importance_sampling_ratio/mean': 0.9999778270721436, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0302691981196404, 'clip_ratio/low_mean': 3.252214798976638e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.682960474790889e-06, 'clip_ratio/high_max': 1.9026635982299922e-05, 'clip_ratio/region_mean': 
3.920510800980992e-05, 'epoch': 0.07} + + 7%|▋ | 76/1024 [3:15:20<41:07:09, 156.15s/it]INFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:40:20 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 77/1024 [3:17:44<40:08:15, 152.58s/it] + {'loss': 0.0236, 'grad_norm': 0.0037541294004768133, 'learning_rate': 1e-05, 'num_tokens': 60001208.0, 'completions/mean_length': 5727.796875, 'completions/min_length': 743.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5643.8896484375, 'completions/min_terminated_length': 743.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.20753079652786255, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020555900409817696, 'sampling/sampling_logp_difference/max': 8.400880813598633, 'sampling/importance_sampling_ratio/min': 0.00022466933296527714, 'sampling/importance_sampling_ratio/mean': 0.9999213218688965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9781062752008438, 'clip_ratio/low_mean': 3.63567767180939e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.4910855220005033e-06, 'clip_ratio/high_max': 1.3964342088002013e-05, 'clip_ratio/region_mean': 3.984786212640756e-05, 'epoch': 0.07} + + 8%|▊ | 77/1024 [3:17:44<40:08:15, 152.58s/it]INFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:42:44 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 78/1024 [3:20:36<41:33:37, 158.16s/it] + {'loss': 0.0754, 'grad_norm': 
0.007178841158747673, 'learning_rate': 1e-05, 'num_tokens': 60777899.0, 'completions/mean_length': 5923.8359375, 'completions/min_length': 597.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5409.4013671875, 'completions/min_terminated_length': 597.0, 'completions/max_terminated_length': 15720.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2977364659309387, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019868161529302597, 'sampling/sampling_logp_difference/max': 7.621582508087158, 'sampling/importance_sampling_ratio/min': 0.0004897661856375635, 'sampling/importance_sampling_ratio/mean': 0.9999773502349854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9449758678674698, 'clip_ratio/low_mean': 3.516969627526123e-05, 'clip_ratio/low_min': 4.025116595585132e-06, 'clip_ratio/high_mean': 6.949231874386896e-07, 'clip_ratio/high_max': 2.7796927497547586e-06, 'clip_ratio/region_mean': 3.586461934901308e-05, 'epoch': 0.07} + + 8%|▊ | 78/1024 [3:20:36<41:33:37, 158.16s/it]INFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:45:35 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 79/1024 [3:23:03<40:38:16, 154.81s/it] + {'loss': 0.0136, 'grad_norm': 0.004776299465447664, 'learning_rate': 1e-05, 'num_tokens': 61587141.0, 'completions/mean_length': 6171.640625, 'completions/min_length': 721.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5926.54443359375, 'completions/min_terminated_length': 721.0, 'completions/max_terminated_length': 14267.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 
0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.36113685369491577, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019484341144561768, 'sampling/sampling_logp_difference/max': 10.124996185302734, 'sampling/importance_sampling_ratio/min': 4.0065449866233394e-05, 'sampling/importance_sampling_ratio/mean': 0.999945878982544, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8597526922821999, 'clip_ratio/low_mean': 4.3257180891487224e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.016423746288638e-06, 'clip_ratio/high_max': 2.7642782697512303e-05, 'clip_ratio/region_mean': 5.227360486514954e-05, 'epoch': 0.07} + + 8%|▊ | 79/1024 [3:23:03<40:38:16, 154.81s/it]INFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:48:02 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 80/1024 [3:25:28<39:52:32, 152.07s/it] + {'loss': 0.0539, 'grad_norm': 0.007431659381836653, 'learning_rate': 1e-05, 'num_tokens': 62308321.0, 'completions/mean_length': 5501.59375, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5415.9052734375, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 15310.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.400318443775177, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019636545330286026, 'sampling/sampling_logp_difference/max': 9.999296188354492, 'sampling/importance_sampling_ratio/min': 4.54318942502141e-05, 'sampling/importance_sampling_ratio/mean': 1.0000393390655518, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9705724790692329, 'clip_ratio/low_mean': 
3.6077020070024446e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.036492244566034e-05, 'clip_ratio/high_max': 4.145968978264136e-05, 'clip_ratio/region_mean': 4.644194200409402e-05, 'epoch': 0.07} + + 8%|▊ | 80/1024 [3:25:28<39:52:32, 152.07s/it]INFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:50:28 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 81/1024 [3:27:55<39:25:09, 150.49s/it] + {'loss': 0.0797, 'grad_norm': 0.005465450696647167, 'learning_rate': 1e-05, 'num_tokens': 63084113.0, 'completions/mean_length': 5908.125, 'completions/min_length': 504.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5825.6376953125, 'completions/min_terminated_length': 504.0, 'completions/max_terminated_length': 15781.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.39400771260261536, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018073562532663345, 'sampling/sampling_logp_difference/max': 9.951221466064453, 'sampling/importance_sampling_ratio/min': 4.766937126987614e-05, 'sampling/importance_sampling_ratio/mean': 0.9999576210975647, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8575867265462875, 'clip_ratio/low_mean': 6.429905033655814e-05, 'clip_ratio/low_min': 6.3626184783061035e-06, 'clip_ratio/high_mean': 1.081801542568428e-06, 'clip_ratio/high_max': 4.327206170273712e-06, 'clip_ratio/region_mean': 6.538085153806605e-05, 'epoch': 0.07} + + 8%|▊ | 81/1024 [3:27:55<39:25:09, 150.49s/it]INFO 12-01 16:52:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:52:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:52:55 [block_pool.py:292] Successfully 
reset prefix cache +INFO 12-01 16:52:55 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 82/1024 [3:30:19<38:49:45, 148.39s/it] + {'loss': -0.0104, 'grad_norm': 0.003077819012105465, 'learning_rate': 1e-05, 'num_tokens': 63740015.0, 'completions/mean_length': 4906.734375, 'completions/min_length': 108.0, 'completions/max_length': 15981.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4906.734375, 'completions/min_terminated_length': 108.0, 'completions/max_terminated_length': 15981.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2251344621181488, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01949312724173069, 'sampling/sampling_logp_difference/max': 9.879111289978027, 'sampling/importance_sampling_ratio/min': 5.1233790145488456e-05, 'sampling/importance_sampling_ratio/mean': 1.000091791152954, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9647495672106743, 'clip_ratio/low_mean': 3.040744320514932e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6878207134141121e-06, 'clip_ratio/high_max': 6.7512828536564484e-06, 'clip_ratio/region_mean': 3.209526391856343e-05, 'epoch': 0.08} + + 8%|▊ | 82/1024 [3:30:19<38:49:45, 148.39s/it]INFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:55:18 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 83/1024 [3:32:31<37:31:47, 143.58s/it] + {'loss': 0.0364, 'grad_norm': 0.0018245981773361564, 'learning_rate': 1e-05, 'num_tokens': 64450515.0, 'completions/mean_length': 5402.78125, 'completions/min_length': 277.0, 'completions/max_length': 15716.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5402.78125, 'completions/min_terminated_length': 277.0, 
'completions/max_terminated_length': 15716.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019615523517131805, 'sampling/sampling_logp_difference/max': 6.93695592880249, 'sampling/importance_sampling_ratio/min': 0.0009712215978652239, 'sampling/importance_sampling_ratio/mean': 0.9999257922172546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9809223562479019, 'clip_ratio/low_mean': 3.626802561029763e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8155938050767872e-06, 'clip_ratio/high_max': 7.262375220307149e-06, 'clip_ratio/region_mean': 3.8083618960627064e-05, 'epoch': 0.08} + + 8%|▊ | 83/1024 [3:32:31<37:31:47, 143.58s/it]INFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 16:57:30 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 84/1024 [3:35:26<39:57:20, 153.02s/it] + {'loss': 0.0645, 'grad_norm': 0.006053395569324493, 'learning_rate': 1e-05, 'num_tokens': 65269285.0, 'completions/mean_length': 6198.703125, 'completions/min_length': 265.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5870.14501953125, 'completions/min_terminated_length': 265.0, 'completions/max_terminated_length': 16329.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3464113473892212, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01869945600628853, 'sampling/sampling_logp_difference/max': 6.874996662139893, 'sampling/importance_sampling_ratio/min': 0.0010333011159673333, 'sampling/importance_sampling_ratio/mean': 
0.9999875426292419, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8571672514081001, 'clip_ratio/low_mean': 4.734331901090627e-05, 'clip_ratio/low_min': 1.1585900665522786e-05, 'clip_ratio/high_mean': 2.9435553301482287e-06, 'clip_ratio/high_max': 1.1774221320592915e-05, 'clip_ratio/region_mean': 5.0286874625271594e-05, 'epoch': 0.08} + + 8%|▊ | 84/1024 [3:35:26<39:57:20, 153.02s/it]INFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:00:26 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 85/1024 [3:38:03<40:12:00, 154.12s/it] + {'loss': 0.0681, 'grad_norm': 0.0030623299535363913, 'learning_rate': 1e-05, 'num_tokens': 66058473.0, 'completions/mean_length': 6016.09375, 'completions/min_length': 370.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5851.52392578125, 'completions/min_terminated_length': 370.0, 'completions/max_terminated_length': 15972.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.24883407354354858, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02085939608514309, 'sampling/sampling_logp_difference/max': 6.4199748039245605, 'sampling/importance_sampling_ratio/min': 0.0016286972677335143, 'sampling/importance_sampling_ratio/mean': 0.9999305009841919, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9883866459131241, 'clip_ratio/low_mean': 3.2358174394175876e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.78695198278001e-06, 'clip_ratio/high_max': 2.7282983865006827e-05, 'clip_ratio/region_mean': 4.0145126376955886e-05, 'epoch': 0.08} + + 8%|▊ | 85/1024 [3:38:03<40:12:00, 154.12s/it]INFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:03:02 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 86/1024 [3:40:50<41:13:05, 158.19s/it] + {'loss': 0.0389, 'grad_norm': 0.0038264680188149214, 'learning_rate': 1e-05, 'num_tokens': 66984285.0, 'completions/mean_length': 7072.53125, 'completions/min_length': 48.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6924.73046875, 'completions/min_terminated_length': 48.0, 'completions/max_terminated_length': 15594.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021116644144058228, 'sampling/sampling_logp_difference/max': 6.17248010635376, 'sampling/importance_sampling_ratio/min': 0.0020860559307038784, 'sampling/importance_sampling_ratio/mean': 0.9999492764472961, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0157204791903496, 'clip_ratio/low_mean': 3.9277208315979806e-05, 'clip_ratio/low_min': 4.51475443696836e-06, 'clip_ratio/high_mean': 7.449344252563606e-07, 'clip_ratio/high_max': 2.9797377010254422e-06, 'clip_ratio/region_mean': 4.002214268439275e-05, 'epoch': 0.08} + + 8%|▊ | 86/1024 [3:40:50<41:13:05, 158.19s/it]INFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:05:50 [block_pool.py:292] Successfully reset prefix cache + + 8%|▊ | 87/1024 [3:43:28<41:06:57, 157.97s/it] + {'loss': 0.0583, 'grad_norm': 0.0044838739559054375, 'learning_rate': 1e-05, 'num_tokens': 67840310.0, 'completions/mean_length': 6539.8203125, 'completions/min_length': 2.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6303.56005859375, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 15923.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2722293734550476, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020990263670682907, 'sampling/sampling_logp_difference/max': 6.374995231628418, 'sampling/importance_sampling_ratio/min': 0.001703627873212099, 'sampling/importance_sampling_ratio/mean': 0.9999875426292419, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0071343630552292, 'clip_ratio/low_mean': 3.757404465432046e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5179480215010699e-06, 'clip_ratio/high_max': 6.0717920860042796e-06, 'clip_ratio/region_mean': 3.909199278950837e-05, 'epoch': 0.08} + + 8%|▊ | 87/1024 [3:43:28<41:06:57, 157.97s/it]INFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:08:27 [block_pool.py:292] Successfully reset prefix cache + + 9%|▊ | 88/1024 [3:46:23<42:24:42, 163.12s/it] + {'loss': -0.0057, 'grad_norm': 0.0034659637603908777, 'learning_rate': 1e-05, 'num_tokens': 68782042.0, 'completions/mean_length': 7204.09375, 'completions/min_length': 42.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6907.9677734375, 'completions/min_terminated_length': 42.0, 'completions/max_terminated_length': 16224.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.27958327531814575, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02053149789571762, 
'sampling/sampling_logp_difference/max': 8.002180099487305, 'sampling/importance_sampling_ratio/min': 0.0003347320598550141, 'sampling/importance_sampling_ratio/mean': 0.9999324083328247, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9221752807497978, 'clip_ratio/low_mean': 3.50394579982094e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.14752542307906e-06, 'clip_ratio/high_max': 2.859010169231624e-05, 'clip_ratio/region_mean': 4.218698381919239e-05, 'epoch': 0.08} + + 9%|▊ | 88/1024 [3:46:23<42:24:42, 163.12s/it]INFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:11:23 [block_pool.py:292] Successfully reset prefix cache + + 9%|▊ | 89/1024 [3:49:00<41:53:39, 161.30s/it] + {'loss': 0.0584, 'grad_norm': 0.0024458845146000385, 'learning_rate': 1e-05, 'num_tokens': 69526295.0, 'completions/mean_length': 5662.1640625, 'completions/min_length': 391.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5577.740234375, 'completions/min_terminated_length': 391.0, 'completions/max_terminated_length': 14764.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.18543373048305511, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018346723169088364, 'sampling/sampling_logp_difference/max': 5.6851115226745605, 'sampling/importance_sampling_ratio/min': 0.0033961546141654253, 'sampling/importance_sampling_ratio/mean': 0.9999278783798218, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9678512960672379, 'clip_ratio/low_mean': 2.086669928758056e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.355054784355161e-06, 'clip_ratio/high_max': 1.7420219137420645e-05, 'clip_ratio/region_mean': 
2.522175350350153e-05, 'epoch': 0.08} + + 9%|▊ | 89/1024 [3:49:00<41:53:39, 161.30s/it]INFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:14:00 [block_pool.py:292] Successfully reset prefix cache + + 9%|▉ | 90/1024 [3:51:27<40:46:23, 157.16s/it] + {'loss': 0.0719, 'grad_norm': 0.004733253736048937, 'learning_rate': 1e-05, 'num_tokens': 70262771.0, 'completions/mean_length': 5590.71875, 'completions/min_length': 382.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5505.732421875, 'completions/min_terminated_length': 382.0, 'completions/max_terminated_length': 16219.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.26933354139328003, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019460031762719154, 'sampling/sampling_logp_difference/max': 11.303396224975586, 'sampling/importance_sampling_ratio/min': 1.233097464137245e-05, 'sampling/importance_sampling_ratio/mean': 0.9999312162399292, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9286820441484451, 'clip_ratio/low_mean': 1.8629728629093734e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0234394924045773e-06, 'clip_ratio/high_max': 8.09375796961831e-06, 'clip_ratio/region_mean': 2.0653167894124635e-05, 'epoch': 0.08} + + 9%|▉ | 90/1024 [3:51:27<40:46:23, 157.16s/it]INFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:16:27 [block_pool.py:292] Successfully reset prefix cache + + 9%|▉ | 91/1024 [3:54:03<40:36:27, 156.69s/it] + {'loss': 0.0223, 'grad_norm': 
0.00468763243407011, 'learning_rate': 1e-05, 'num_tokens': 71079953.0, 'completions/mean_length': 6182.484375, 'completions/min_length': 319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6102.1572265625, 'completions/min_terminated_length': 319.0, 'completions/max_terminated_length': 15879.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.26933354139328003, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02069907821714878, 'sampling/sampling_logp_difference/max': 9.24995231628418, 'sampling/importance_sampling_ratio/min': 9.611623681848869e-05, 'sampling/importance_sampling_ratio/mean': 1.0000090599060059, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0872880518436432, 'clip_ratio/low_mean': 2.489819087259093e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.592780669554486e-06, 'clip_ratio/high_max': 1.8371122678217944e-05, 'clip_ratio/region_mean': 2.949097142845858e-05, 'epoch': 0.08} + + 9%|▉ | 91/1024 [3:54:03<40:36:27, 156.69s/it]INFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:19:03 [block_pool.py:292] Successfully reset prefix cache + + 9%|▉ | 92/1024 [3:56:35<40:09:41, 155.13s/it] + {'loss': 0.0642, 'grad_norm': 0.0034273737110197544, 'learning_rate': 1e-05, 'num_tokens': 71856574.0, 'completions/mean_length': 5909.2265625, 'completions/min_length': 433.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5826.748046875, 'completions/min_terminated_length': 433.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 
0.4140625, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019539739936590195, 'sampling/sampling_logp_difference/max': 8.687297821044922, 'sampling/importance_sampling_ratio/min': 0.00016871529805939645, 'sampling/importance_sampling_ratio/mean': 0.9998411536216736, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9488153457641602, 'clip_ratio/low_mean': 2.6412633246764017e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.633066396309005e-06, 'clip_ratio/high_max': 1.579416039021453e-05, 'clip_ratio/region_mean': 3.1045699415699346e-05, 'epoch': 0.08} + + 9%|▉ | 92/1024 [3:56:35<40:09:41, 155.13s/it]INFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:21:34 [block_pool.py:292] Successfully reset prefix cache + + 9%|▉ | 93/1024 [3:59:10<40:07:39, 155.17s/it] + {'loss': 0.0314, 'grad_norm': 0.003149663796648383, 'learning_rate': 1e-05, 'num_tokens': 72696806.0, 'completions/mean_length': 6381.3125, 'completions/min_length': 58.0, 'completions/max_length': 15933.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6381.3125, 'completions/min_terminated_length': 58.0, 'completions/max_terminated_length': 15933.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021038895472884178, 'sampling/sampling_logp_difference/max': 7.997447967529297, 'sampling/importance_sampling_ratio/min': 0.00033631984842941165, 'sampling/importance_sampling_ratio/mean': 0.999916136264801, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9708949401974678, 'clip_ratio/low_mean': 4.2946558664880286e-05, 'clip_ratio/low_min': 0.0, 
'clip_ratio/high_mean': 9.362454420624999e-07, 'clip_ratio/high_max': 3.7449817682499997e-06, 'clip_ratio/region_mean': 4.388280387956911e-05, 'epoch': 0.09} + + 9%|▉ | 93/1024 [3:59:10<40:07:39, 155.17s/it]INFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:24:09 [block_pool.py:292] Successfully reset prefix cache + + 9%|▉ | 94/1024 [4:01:44<40:00:37, 154.88s/it] + {'loss': 0.0355, 'grad_norm': 0.0029015145264565945, 'learning_rate': 1e-05, 'num_tokens': 73449210.0, 'completions/mean_length': 5726.03125, 'completions/min_length': 831.0, 'completions/max_length': 16180.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5726.03125, 'completions/min_terminated_length': 831.0, 'completions/max_terminated_length': 16180.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020026210695505142, 'sampling/sampling_logp_difference/max': 8.68747615814209, 'sampling/importance_sampling_ratio/min': 0.0001686852192506194, 'sampling/importance_sampling_ratio/mean': 0.9999687671661377, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9100239053368568, 'clip_ratio/low_mean': 4.956343445883249e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6230393384830677e-06, 'clip_ratio/high_max': 6.492157353932271e-06, 'clip_ratio/region_mean': 5.118647413837607e-05, 'epoch': 0.09} + + 9%|▉ | 94/1024 [4:01:44<40:00:37, 154.88s/it]INFO 12-01 17:26:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:26:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:26:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:26:44 [block_pool.py:292] Successfully reset 
prefix cache + + 9%|▉ | 95/1024 [4:04:03<38:42:35, 150.01s/it] + {'loss': 0.0387, 'grad_norm': 0.0045582144521176815, 'learning_rate': 1e-05, 'num_tokens': 74212662.0, 'completions/mean_length': 5824.90625, 'completions/min_length': 364.0, 'completions/max_length': 15624.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5824.90625, 'completions/min_terminated_length': 364.0, 'completions/max_terminated_length': 15624.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.24777324497699738, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019039880484342575, 'sampling/sampling_logp_difference/max': 6.146263599395752, 'sampling/importance_sampling_ratio/min': 0.0021414682269096375, 'sampling/importance_sampling_ratio/mean': 1.0000125169754028, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9244210943579674, 'clip_ratio/low_mean': 1.4287397789303213e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.99904036182852e-06, 'clip_ratio/high_max': 1.199616144731408e-05, 'clip_ratio/region_mean': 1.7286438151131733e-05, 'epoch': 0.09} + + 9%|▉ | 95/1024 [4:04:03<38:42:35, 150.01s/it]INFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:29:02 [block_pool.py:292] Successfully reset prefix cache + + 9%|▉ | 96/1024 [4:06:34<38:48:11, 150.53s/it] + {'loss': 0.0776, 'grad_norm': 0.0040692174807190895, 'learning_rate': 1e-05, 'num_tokens': 75054003.0, 'completions/mean_length': 6432.7265625, 'completions/min_length': 199.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6274.77001953125, 'completions/min_terminated_length': 199.0, 'completions/max_terminated_length': 15600.0, 
'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.35506343841552734, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019711513072252274, 'sampling/sampling_logp_difference/max': 5.194499492645264, 'sampling/importance_sampling_ratio/min': 0.005546991713345051, 'sampling/importance_sampling_ratio/mean': 0.9998587369918823, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8756264597177505, 'clip_ratio/low_mean': 4.0637585470904014e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.527106175875815e-06, 'clip_ratio/high_max': 1.010842470350326e-05, 'clip_ratio/region_mean': 4.316469153309299e-05, 'epoch': 0.09} + + 9%|▉ | 96/1024 [4:06:34<38:48:11, 150.53s/it]INFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:31:34 [block_pool.py:292] Successfully reset prefix cache + + 9%|▉ | 97/1024 [4:08:58<38:15:55, 148.60s/it] + {'loss': 0.1137, 'grad_norm': 0.0035478502977639437, 'learning_rate': 1e-05, 'num_tokens': 75773194.0, 'completions/mean_length': 5474.6796875, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5388.779296875, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 14589.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.26037710905075073, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018789665773510933, 'sampling/sampling_logp_difference/max': 5.454678535461426, 'sampling/importance_sampling_ratio/min': 0.004276251420378685, 'sampling/importance_sampling_ratio/mean': 1.0000132322311401, 'sampling/importance_sampling_ratio/max': 
2.0, 'entropy': 0.9279408678412437, 'clip_ratio/low_mean': 3.6582903135240485e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.6582903135240485e-05, 'epoch': 0.09} + + 9%|▉ | 97/1024 [4:08:58<38:15:55, 148.60s/it]INFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:33:58 [block_pool.py:292] Successfully reset prefix cache + + 10%|▉ | 98/1024 [4:11:30<38:29:01, 149.61s/it] + {'loss': 0.0681, 'grad_norm': 0.004816337022930384, 'learning_rate': 1e-05, 'num_tokens': 76654837.0, 'completions/mean_length': 6730.2734375, 'completions/min_length': 235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6577.0400390625, 'completions/min_terminated_length': 235.0, 'completions/max_terminated_length': 15653.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.35325103998184204, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021000642329454422, 'sampling/sampling_logp_difference/max': 13.464577674865723, 'sampling/importance_sampling_ratio/min': 1.4203919818100985e-06, 'sampling/importance_sampling_ratio/mean': 1.0000003576278687, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0115349367260933, 'clip_ratio/low_mean': 4.1461861655989196e-05, 'clip_ratio/low_min': 3.5008122267754516e-06, 'clip_ratio/high_mean': 2.0568871832438163e-06, 'clip_ratio/high_max': 8.227548732975265e-06, 'clip_ratio/region_mean': 4.351874804342515e-05, 'epoch': 0.09} + + 10%|▉ | 98/1024 [4:11:30<38:29:01, 149.61s/it]INFO 12-01 17:36:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:36:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:36:30 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:36:30 [block_pool.py:292] Successfully reset prefix cache + + 10%|▉ | 99/1024 [4:13:47<37:27:34, 145.79s/it] + {'loss': -0.0188, 'grad_norm': 0.00695947976782918, 'learning_rate': 1e-05, 'num_tokens': 77287704.0, 'completions/mean_length': 4804.5859375, 'completions/min_length': 54.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 4620.7861328125, 'completions/min_terminated_length': 54.0, 'completions/max_terminated_length': 14350.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.2688046097755432, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019261913374066353, 'sampling/sampling_logp_difference/max': 2.9661245346069336, 'sampling/importance_sampling_ratio/min': 0.051502522081136703, 'sampling/importance_sampling_ratio/mean': 1.000001072883606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8622925356030464, 'clip_ratio/low_mean': 2.399133984454238e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.208268930800841e-06, 'clip_ratio/high_max': 2.0833075723203365e-05, 'clip_ratio/region_mean': 2.919960945746425e-05, 'epoch': 0.09} + + 10%|▉ | 99/1024 [4:13:47<37:27:34, 145.79s/it]INFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:38:47 [block_pool.py:292] Successfully reset prefix cache + + 10%|▉ | 100/1024 [4:16:25<38:20:47, 149.40s/it] + {'loss': 0.1412, 'grad_norm': 0.0034830078948289156, 'learning_rate': 1e-05, 'num_tokens': 78054048.0, 'completions/mean_length': 5836.25, 'completions/min_length': 310.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 
5753.19677734375, 'completions/min_terminated_length': 310.0, 'completions/max_terminated_length': 15997.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.29036492109298706, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01845550537109375, 'sampling/sampling_logp_difference/max': 12.792928695678711, 'sampling/importance_sampling_ratio/min': 2.7803641842183424e-06, 'sampling/importance_sampling_ratio/mean': 0.9999365210533142, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8808795213699341, 'clip_ratio/low_mean': 3.53349669239833e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.221566203137627e-06, 'clip_ratio/high_max': 1.2886264812550507e-05, 'clip_ratio/region_mean': 3.8556532899747253e-05, 'epoch': 0.09} + + 10%|▉ | 100/1024 [4:16:25<38:20:47, 149.40s/it]INFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:41:25 [block_pool.py:292] Successfully reset prefix cache + + 10%|▉ | 101/1024 [4:18:52<38:07:51, 148.72s/it] + {'loss': -0.0024, 'grad_norm': 0.0028610217850655317, 'learning_rate': 1e-05, 'num_tokens': 78765225.0, 'completions/mean_length': 5407.5703125, 'completions/min_length': 374.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5233.341796875, 'completions/min_terminated_length': 374.0, 'completions/max_terminated_length': 13964.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.26037710905075073, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018839653581380844, 'sampling/sampling_logp_difference/max': 9.742315292358398, 'sampling/importance_sampling_ratio/min': 
5.874436828889884e-05, 'sampling/importance_sampling_ratio/mean': 0.9999171495437622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9438152015209198, 'clip_ratio/low_mean': 3.4728200375866436e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.57742361909186e-06, 'clip_ratio/high_max': 2.630969447636744e-05, 'clip_ratio/region_mean': 4.1305623994958296e-05, 'epoch': 0.09} + + 10%|▉ | 101/1024 [4:18:52<38:07:51, 148.72s/it]INFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:43:52 [block_pool.py:292] Successfully reset prefix cache + + 10%|▉ | 102/1024 [4:21:39<39:29:10, 154.18s/it] + {'loss': 0.0626, 'grad_norm': 0.004098972305655479, 'learning_rate': 1e-05, 'num_tokens': 79628691.0, 'completions/mean_length': 6591.765625, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6436.33349609375, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15780.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.26932865381240845, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02011241763830185, 'sampling/sampling_logp_difference/max': 6.386111259460449, 'sampling/importance_sampling_ratio/min': 0.001684795250184834, 'sampling/importance_sampling_ratio/mean': 0.9999697208404541, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9185260459780693, 'clip_ratio/low_mean': 3.569766681721376e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.917444360013178e-06, 'clip_ratio/high_max': 1.2485550996643724e-05, 'clip_ratio/region_mean': 3.961511060879275e-05, 'epoch': 0.09} + + 10%|▉ | 102/1024 [4:21:39<39:29:10, 154.18s/it]INFO 12-01 17:46:39 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:46:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:46:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:46:39 [block_pool.py:292] Successfully reset prefix cache + + 10%|█ | 103/1024 [4:24:25<40:18:08, 157.53s/it] + {'loss': 0.0695, 'grad_norm': 0.003109709592536092, 'learning_rate': 1e-05, 'num_tokens': 80513135.0, 'completions/mean_length': 6762.40625, 'completions/min_length': 181.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6371.2841796875, 'completions/min_terminated_length': 181.0, 'completions/max_terminated_length': 16014.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.27274850010871887, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021886618807911873, 'sampling/sampling_logp_difference/max': 5.6049675941467285, 'sampling/importance_sampling_ratio/min': 0.0036795397754758596, 'sampling/importance_sampling_ratio/mean': 0.999967098236084, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0496173724532127, 'clip_ratio/low_mean': 2.3897301389297354e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.543192294979235e-06, 'clip_ratio/high_max': 1.017276917991694e-05, 'clip_ratio/region_mean': 2.644049368427659e-05, 'epoch': 0.09} + + 10%|█ | 103/1024 [4:24:25<40:18:08, 157.53s/it]INFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:49:24 [block_pool.py:292] Successfully reset prefix cache + + 10%|█ | 104/1024 [4:27:18<41:27:11, 162.21s/it] + {'loss': 0.1028, 'grad_norm': 0.0050065224058926105, 'learning_rate': 1e-05, 'num_tokens': 81579941.0, 'completions/mean_length': 8151.421875, 
'completions/min_length': 1052.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7528.79052734375, 'completions/min_terminated_length': 1052.0, 'completions/max_terminated_length': 15653.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.36691081523895264, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02017449401319027, 'sampling/sampling_logp_difference/max': 7.187410831451416, 'sampling/importance_sampling_ratio/min': 0.0007560441154055297, 'sampling/importance_sampling_ratio/mean': 0.9999760389328003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8989155367016792, 'clip_ratio/low_mean': 5.0279177912671e-05, 'clip_ratio/low_min': 6.849113788121031e-06, 'clip_ratio/high_mean': 2.6558238346297003e-06, 'clip_ratio/high_max': 1.0623295338518801e-05, 'clip_ratio/region_mean': 5.29350020315178e-05, 'epoch': 0.1} + + 10%|█ | 104/1024 [4:27:18<41:27:11, 162.21s/it]INFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:52:17 [block_pool.py:292] Successfully reset prefix cache + + 10%|█ | 105/1024 [4:29:53<40:52:02, 160.09s/it] + {'loss': 0.0927, 'grad_norm': 0.00352756236679852, 'learning_rate': 1e-05, 'num_tokens': 82479474.0, 'completions/mean_length': 6871.7265625, 'completions/min_length': 1044.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6643.43212890625, 'completions/min_terminated_length': 1044.0, 'completions/max_terminated_length': 16094.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.33296146988868713, 'frac_reward_zero_std': 0.3125, 
'sampling/sampling_logp_difference/mean': 0.021244853734970093, 'sampling/sampling_logp_difference/max': 3.749523162841797, 'sampling/importance_sampling_ratio/min': 0.023528963327407837, 'sampling/importance_sampling_ratio/mean': 1.000028133392334, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.006680078804493, 'clip_ratio/low_mean': 4.2927287609018094e-05, 'clip_ratio/low_min': 4.201963292871369e-06, 'clip_ratio/high_mean': 1.9156864254910033e-06, 'clip_ratio/high_max': 7.662745701964013e-06, 'clip_ratio/region_mean': 4.484297357976175e-05, 'epoch': 0.1} + + 10%|█ | 105/1024 [4:29:53<40:52:02, 160.09s/it]INFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:54:52 [block_pool.py:292] Successfully reset prefix cache + + 10%|█ | 106/1024 [4:32:08<38:56:25, 152.71s/it] + {'loss': -0.0169, 'grad_norm': 0.002348776441067457, 'learning_rate': 1e-05, 'num_tokens': 83229071.0, 'completions/mean_length': 5705.6015625, 'completions/min_length': 802.0, 'completions/max_length': 14462.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5705.6015625, 'completions/min_terminated_length': 802.0, 'completions/max_terminated_length': 14462.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29249149560928345, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01885361783206463, 'sampling/sampling_logp_difference/max': 11.35004997253418, 'sampling/importance_sampling_ratio/min': 1.176890145870857e-05, 'sampling/importance_sampling_ratio/mean': 0.9999898672103882, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9162084609270096, 'clip_ratio/low_mean': 2.3860119426899473e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.544197733797773e-06, 
'clip_ratio/high_max': 1.6621729173493804e-05, 'clip_ratio/region_mean': 2.9404316592263058e-05, 'epoch': 0.1} + + 10%|█ | 106/1024 [4:32:08<38:56:25, 152.71s/it]INFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:57:08 [block_pool.py:292] Successfully reset prefix cache + + 10%|█ | 107/1024 [4:34:46<39:16:24, 154.18s/it] + {'loss': 0.038, 'grad_norm': 0.005057404283434153, 'learning_rate': 1e-05, 'num_tokens': 84119947.0, 'completions/mean_length': 6823.90625, 'completions/min_length': 129.0, 'completions/max_length': 16110.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6823.90625, 'completions/min_terminated_length': 129.0, 'completions/max_terminated_length': 16110.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.31246691942214966, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021600374951958656, 'sampling/sampling_logp_difference/max': 4.219791412353516, 'sampling/importance_sampling_ratio/min': 0.014701711013913155, 'sampling/importance_sampling_ratio/mean': 0.9999507665634155, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0139815732836723, 'clip_ratio/low_mean': 5.359476631383586e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.359476631383586e-05, 'epoch': 0.1} + + 10%|█ | 107/1024 [4:34:46<39:16:24, 154.18s/it]INFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 17:59:46 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 108/1024 [4:37:04<38:01:32, 149.45s/it] + {'loss': 
0.0506, 'grad_norm': 0.008517255075275898, 'learning_rate': 1e-05, 'num_tokens': 84879833.0, 'completions/mean_length': 5786.859375, 'completions/min_length': 643.0, 'completions/max_length': 15516.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5786.859375, 'completions/min_terminated_length': 643.0, 'completions/max_terminated_length': 15516.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.3311441242694855, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01993538998067379, 'sampling/sampling_logp_difference/max': 9.187470436096191, 'sampling/importance_sampling_ratio/min': 0.00010231334454147145, 'sampling/importance_sampling_ratio/mean': 0.9999799728393555, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0515320897102356, 'clip_ratio/low_mean': 3.813199691649061e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.10628331337648e-06, 'clip_ratio/high_max': 1.642513325350592e-05, 'clip_ratio/region_mean': 4.2238279775119736e-05, 'epoch': 0.1} + + 11%|█ | 108/1024 [4:37:04<38:01:32, 149.45s/it]INFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:02:04 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 109/1024 [4:39:34<37:59:46, 149.49s/it] + {'loss': 0.0214, 'grad_norm': 0.0034334585070610046, 'learning_rate': 1e-05, 'num_tokens': 85503162.0, 'completions/mean_length': 4726.2578125, 'completions/min_length': 406.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4634.46435546875, 'completions/min_terminated_length': 406.0, 'completions/max_terminated_length': 15836.0, 'rewards/accuracy_reward/mean': 0.6015625, 'rewards/accuracy_reward/std': 
0.4915000796318054, 'reward': 0.6015625, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018191032111644745, 'sampling/sampling_logp_difference/max': 5.9298248291015625, 'sampling/importance_sampling_ratio/min': 0.0026589478366076946, 'sampling/importance_sampling_ratio/mean': 1.0000437498092651, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.795353539288044, 'clip_ratio/low_mean': 1.4313530300569255e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7510926682007266e-06, 'clip_ratio/high_max': 7.0043706728029065e-06, 'clip_ratio/region_mean': 1.606462308245682e-05, 'epoch': 0.1} + + 11%|█ | 109/1024 [4:39:34<37:59:46, 149.49s/it]INFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:04:34 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 110/1024 [4:42:27<39:46:57, 156.69s/it] + {'loss': 0.0811, 'grad_norm': 0.006242698058485985, 'learning_rate': 1e-05, 'num_tokens': 86350364.0, 'completions/mean_length': 6450.140625, 'completions/min_length': 401.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5787.8837890625, 'completions/min_terminated_length': 401.0, 'completions/max_terminated_length': 14514.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.27540695667266846, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01948007568717003, 'sampling/sampling_logp_difference/max': 8.794099807739258, 'sampling/importance_sampling_ratio/min': 0.00015162504860199988, 'sampling/importance_sampling_ratio/mean': 0.9999819993972778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8920315206050873, 'clip_ratio/low_mean': 
3.989860044839588e-05, 'clip_ratio/low_min': 4.927079316985328e-06, 'clip_ratio/high_mean': 1.037309971252398e-06, 'clip_ratio/high_max': 4.149239885009592e-06, 'clip_ratio/region_mean': 4.093591041964828e-05, 'epoch': 0.1} + + 11%|█ | 110/1024 [4:42:27<39:46:57, 156.69s/it]INFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:07:27 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 111/1024 [4:45:07<39:57:26, 157.55s/it] + {'loss': 0.018, 'grad_norm': 0.002594202058389783, 'learning_rate': 1e-05, 'num_tokens': 87213277.0, 'completions/mean_length': 6597.9453125, 'completions/min_length': 657.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6442.611328125, 'completions/min_terminated_length': 657.0, 'completions/max_terminated_length': 15253.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.3061561584472656, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02082553133368492, 'sampling/sampling_logp_difference/max': 4.905908584594727, 'sampling/importance_sampling_ratio/min': 0.007402713876217604, 'sampling/importance_sampling_ratio/mean': 0.9998740553855896, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9351271465420723, 'clip_ratio/low_mean': 2.8560575628944207e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8426849237584975e-06, 'clip_ratio/high_max': 4.065173015987966e-06, 'clip_ratio/region_mean': 3.0403260552702704e-05, 'epoch': 0.1} + + 11%|█ | 111/1024 [4:45:07<39:57:26, 157.55s/it]INFO 12-01 18:10:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:10:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:10:07 [block_pool.py:292] Successfully 
reset prefix cache +INFO 12-01 18:10:07 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 112/1024 [4:47:46<40:03:31, 158.13s/it] + {'loss': 0.0757, 'grad_norm': 0.002718541072681546, 'learning_rate': 1e-05, 'num_tokens': 88144530.0, 'completions/mean_length': 7109.9140625, 'completions/min_length': 881.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7036.8896484375, 'completions/min_terminated_length': 881.0, 'completions/max_terminated_length': 15955.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.26485776901245117, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01960277371108532, 'sampling/sampling_logp_difference/max': 8.36449146270752, 'sampling/importance_sampling_ratio/min': 0.0002329955023014918, 'sampling/importance_sampling_ratio/mean': 0.999973714351654, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8797949478030205, 'clip_ratio/low_mean': 4.297400278119312e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.382130201629479e-07, 'clip_ratio/high_max': 3.7528520806517918e-06, 'clip_ratio/region_mean': 4.391221568766923e-05, 'epoch': 0.1} + + 11%|█ | 112/1024 [4:47:46<40:03:31, 158.13s/it]INFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:12:46 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 113/1024 [4:50:43<41:23:36, 163.57s/it] + {'loss': 0.0854, 'grad_norm': 0.003097688313573599, 'learning_rate': 1e-05, 'num_tokens': 89109897.0, 'completions/mean_length': 7361.6796875, 'completions/min_length': 624.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6513.427734375, 
'completions/min_terminated_length': 624.0, 'completions/max_terminated_length': 15834.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3148210048675537, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01997425965964794, 'sampling/sampling_logp_difference/max': 6.834630012512207, 'sampling/importance_sampling_ratio/min': 0.0010758653515949845, 'sampling/importance_sampling_ratio/mean': 0.9998917579650879, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9020541086792946, 'clip_ratio/low_mean': 4.423825043886609e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.813705350490636e-06, 'clip_ratio/high_max': 1.1254821401962545e-05, 'clip_ratio/region_mean': 4.7051955789356725e-05, 'epoch': 0.1} + + 11%|█ | 113/1024 [4:50:43<41:23:36, 163.57s/it]INFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:15:42 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 114/1024 [4:53:25<41:16:13, 163.27s/it] + {'loss': 0.0869, 'grad_norm': 0.0023438548669219017, 'learning_rate': 1e-05, 'num_tokens': 89891429.0, 'completions/mean_length': 5957.28125, 'completions/min_length': 749.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5620.935546875, 'completions/min_terminated_length': 749.0, 'completions/max_terminated_length': 15608.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3713865876197815, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018976174294948578, 'sampling/sampling_logp_difference/max': 11.706428527832031, 'sampling/importance_sampling_ratio/min': 8.2406731962692e-06, 
'sampling/importance_sampling_ratio/mean': 0.9998185634613037, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8262394368648529, 'clip_ratio/low_mean': 7.228819413285237e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.248351158115838e-06, 'clip_ratio/high_max': 1.8235970401292434e-05, 'clip_ratio/region_mean': 7.753654563202872e-05, 'epoch': 0.1} + + 11%|█ | 114/1024 [4:53:25<41:16:13, 163.27s/it]INFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:18:25 [block_pool.py:292] Successfully reset prefix cache + + 11%|█ | 115/1024 [4:55:59<40:29:03, 160.33s/it] + {'loss': 0.0411, 'grad_norm': 0.005619170609861612, 'learning_rate': 1e-05, 'num_tokens': 90600721.0, 'completions/mean_length': 5405.53125, 'completions/min_length': 230.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5142.04833984375, 'completions/min_terminated_length': 230.0, 'completions/max_terminated_length': 15509.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.40821409225463867, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01931554079055786, 'sampling/sampling_logp_difference/max': 16.351388931274414, 'sampling/importance_sampling_ratio/min': 7.91921266340978e-08, 'sampling/importance_sampling_ratio/mean': 0.9999438524246216, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9246686547994614, 'clip_ratio/low_mean': 5.1420432782833814e-05, 'clip_ratio/low_min': 6.1973228184797335e-06, 'clip_ratio/high_mean': 5.4644419833493885e-06, 'clip_ratio/high_max': 1.6280149793601595e-05, 'clip_ratio/region_mean': 5.688487522093055e-05, 'epoch': 0.11} + + 11%|█ | 115/1024 [4:55:59<40:29:03, 160.33s/it]INFO 12-01 18:20:58 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:20:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:20:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:20:58 [block_pool.py:292] Successfully reset prefix cache + + 11%|█▏ | 116/1024 [4:58:43<40:43:24, 161.46s/it] + {'loss': 0.0379, 'grad_norm': 0.006043895613402128, 'learning_rate': 1e-05, 'num_tokens': 91486063.0, 'completions/mean_length': 6754.859375, 'completions/min_length': 2.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6363.4306640625, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 16106.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2527858018875122, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02107170596718788, 'sampling/sampling_logp_difference/max': 12.875, 'sampling/importance_sampling_ratio/min': 2.5612887384340866e-06, 'sampling/importance_sampling_ratio/mean': 0.9999067783355713, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.952000230550766, 'clip_ratio/low_mean': 3.463903834699522e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.307115153143968e-06, 'clip_ratio/high_max': 9.228460612575873e-06, 'clip_ratio/region_mean': 3.694615350013919e-05, 'epoch': 0.11} + + 11%|█▏ | 116/1024 [4:58:43<40:43:24, 161.46s/it]INFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:23:42 [block_pool.py:292] Successfully reset prefix cache + + 11%|█▏ | 117/1024 [5:01:50<42:38:56, 169.28s/it] + {'loss': 0.0666, 'grad_norm': 0.00392121123149991, 'learning_rate': 1e-05, 'num_tokens': 92546920.0, 'completions/mean_length': 8135.8203125, 
'completions/min_length': 649.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7869.75, 'completions/min_terminated_length': 649.0, 'completions/max_terminated_length': 16377.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2977413833141327, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02211480587720871, 'sampling/sampling_logp_difference/max': 10.189286231994629, 'sampling/importance_sampling_ratio/min': 3.757069134735502e-05, 'sampling/importance_sampling_ratio/mean': 0.9999874830245972, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0832853615283966, 'clip_ratio/low_mean': 3.14642731495951e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.65198184226756e-06, 'clip_ratio/high_max': 1.460792736907024e-05, 'clip_ratio/region_mean': 3.511625499186266e-05, 'epoch': 0.11} + + 11%|█▏ | 117/1024 [5:01:50<42:38:56, 169.28s/it]INFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:26:50 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 118/1024 [5:04:15<40:43:37, 161.83s/it] + {'loss': 0.0378, 'grad_norm': 0.00480870483443141, 'learning_rate': 1e-05, 'num_tokens': 93270524.0, 'completions/mean_length': 5476.53125, 'completions/min_length': 666.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5214.75244140625, 'completions/min_terminated_length': 666.0, 'completions/max_terminated_length': 15497.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3243093490600586, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 
0.01990744285285473, 'sampling/sampling_logp_difference/max': 3.5937137603759766, 'sampling/importance_sampling_ratio/min': 0.02749602682888508, 'sampling/importance_sampling_ratio/mean': 1.000068187713623, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0261689275503159, 'clip_ratio/low_mean': 3.652223790595599e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.964218977780547e-06, 'clip_ratio/high_max': 3.585687591112219e-05, 'clip_ratio/region_mean': 4.548645733848389e-05, 'epoch': 0.11} + + 12%|█▏ | 118/1024 [5:04:15<40:43:37, 161.83s/it]INFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:29:14 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 119/1024 [5:07:14<41:58:40, 166.98s/it] + {'loss': 0.0792, 'grad_norm': 0.003411791054531932, 'learning_rate': 1e-05, 'num_tokens': 94271404.0, 'completions/mean_length': 7670.0625, 'completions/min_length': 964.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7165.9501953125, 'completions/min_terminated_length': 964.0, 'completions/max_terminated_length': 16209.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.28117600083351135, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01960139349102974, 'sampling/sampling_logp_difference/max': 13.061310768127441, 'sampling/importance_sampling_ratio/min': 2.125909531969228e-06, 'sampling/importance_sampling_ratio/mean': 0.999955415725708, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8719229996204376, 'clip_ratio/low_mean': 3.6732255466631614e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2816832395401434e-06, 'clip_ratio/high_max': 5.126732958160574e-06, 
'clip_ratio/region_mean': 3.8013938819858595e-05, 'epoch': 0.11} + + 12%|█▏ | 119/1024 [5:07:14<41:58:40, 166.98s/it]INFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:32:13 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 120/1024 [5:09:43<40:34:10, 161.56s/it] + {'loss': 0.0852, 'grad_norm': 0.0036615384742617607, 'learning_rate': 1e-05, 'num_tokens': 94998263.0, 'completions/mean_length': 5499.0859375, 'completions/min_length': 867.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5413.3779296875, 'completions/min_terminated_length': 867.0, 'completions/max_terminated_length': 15284.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.27776598930358887, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01831059902906418, 'sampling/sampling_logp_difference/max': 8.126622200012207, 'sampling/importance_sampling_ratio/min': 0.00029556488152593374, 'sampling/importance_sampling_ratio/mean': 0.9999586939811707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8891193494200706, 'clip_ratio/low_mean': 3.3884271260831156e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0189622685174982e-05, 'clip_ratio/high_max': 3.2011115308705484e-05, 'clip_ratio/region_mean': 4.4073893604945624e-05, 'epoch': 0.11} + + 12%|█▏ | 120/1024 [5:09:43<40:34:10, 161.56s/it]INFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:34:42 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 121/1024 [5:12:30<40:58:04, 
163.33s/it] + {'loss': 0.0704, 'grad_norm': 0.003688640194013715, 'learning_rate': 1e-05, 'num_tokens': 96020572.0, 'completions/mean_length': 7831.1015625, 'completions/min_length': 855.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7410.466796875, 'completions/min_terminated_length': 855.0, 'completions/max_terminated_length': 15605.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.266974538564682, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020766064524650574, 'sampling/sampling_logp_difference/max': 7.095963478088379, 'sampling/importance_sampling_ratio/min': 0.0008284422219730914, 'sampling/importance_sampling_ratio/mean': 1.0000081062316895, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9511109218001366, 'clip_ratio/low_mean': 3.4662164466681133e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.505237830519036e-06, 'clip_ratio/high_max': 1.0020951322076144e-05, 'clip_ratio/region_mean': 3.716740218351333e-05, 'epoch': 0.11} + + 12%|█▏ | 121/1024 [5:12:30<40:58:04, 163.33s/it]INFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:37:30 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 122/1024 [5:15:40<42:56:43, 171.40s/it] + {'loss': 0.0796, 'grad_norm': 0.002527788048610091, 'learning_rate': 1e-05, 'num_tokens': 97055892.0, 'completions/mean_length': 7928.5, 'completions/min_length': 289.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7584.7802734375, 'completions/min_terminated_length': 289.0, 'completions/max_terminated_length': 16267.0, 'rewards/accuracy_reward/mean': 0.2734375, 
'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.22567617893218994, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02571871504187584, 'sampling/sampling_logp_difference/max': 11.72396469116211, 'sampling/importance_sampling_ratio/min': 8.097423233266454e-06, 'sampling/importance_sampling_ratio/mean': 0.999517560005188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.053833745419979, 'clip_ratio/low_mean': 4.2512260733929e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0797083405122976e-06, 'clip_ratio/high_max': 4.31883336204919e-06, 'clip_ratio/region_mean': 4.359196918812813e-05, 'epoch': 0.11} + + 12%|█▏ | 122/1024 [5:15:40<42:56:43, 171.40s/it]INFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:40:40 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 123/1024 [5:18:20<42:01:35, 167.92s/it] + {'loss': 0.0524, 'grad_norm': 0.004057250916957855, 'learning_rate': 1e-05, 'num_tokens': 98026604.0, 'completions/mean_length': 7433.0, 'completions/min_length': 1112.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7218.17626953125, 'completions/min_terminated_length': 1112.0, 'completions/max_terminated_length': 15282.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.30274903774261475, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020892417058348656, 'sampling/sampling_logp_difference/max': 5.936958312988281, 'sampling/importance_sampling_ratio/min': 0.0026400478091090918, 'sampling/importance_sampling_ratio/mean': 0.9999719858169556, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0001763850450516, 
'clip_ratio/low_mean': 5.3688914704252966e-05, 'clip_ratio/low_min': 1.0726187383625074e-05, 'clip_ratio/high_mean': 5.360034492696286e-06, 'clip_ratio/high_max': 2.1440137970785145e-05, 'clip_ratio/region_mean': 5.904894931063609e-05, 'epoch': 0.11} + + 12%|█▏ | 123/1024 [5:18:20<42:01:35, 167.92s/it]INFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:43:20 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 124/1024 [5:21:03<41:35:03, 166.34s/it] + {'loss': 0.0561, 'grad_norm': 0.004367270041257143, 'learning_rate': 1e-05, 'num_tokens': 98882667.0, 'completions/mean_length': 6529.8046875, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6211.92724609375, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15435.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020555414259433746, 'sampling/sampling_logp_difference/max': 8.874999046325684, 'sampling/importance_sampling_ratio/min': 0.00013984176621306688, 'sampling/importance_sampling_ratio/mean': 0.9999692440032959, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0204281583428383, 'clip_ratio/low_mean': 3.0267089357494115e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8002238562075945e-06, 'clip_ratio/high_max': 7.200895424830378e-06, 'clip_ratio/region_mean': 3.206731355476222e-05, 'epoch': 0.11} + + 12%|█▏ | 124/1024 [5:21:03<41:35:03, 166.34s/it]INFO 12-01 18:46:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:46:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 
18:46:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:46:03 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 125/1024 [5:24:00<42:22:12, 169.67s/it] + {'loss': 0.027, 'grad_norm': 0.0014496444491669536, 'learning_rate': 1e-05, 'num_tokens': 99847384.0, 'completions/mean_length': 7329.9140625, 'completions/min_length': 525.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6806.12353515625, 'completions/min_terminated_length': 525.0, 'completions/max_terminated_length': 15737.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019216356799006462, 'sampling/sampling_logp_difference/max': 10.749985694885254, 'sampling/importance_sampling_ratio/min': 2.1445715901791118e-05, 'sampling/importance_sampling_ratio/mean': 0.9999719262123108, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8461082950234413, 'clip_ratio/low_mean': 3.819216192368913e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.958261901170772e-07, 'clip_ratio/high_max': 3.583304760468309e-06, 'clip_ratio/region_mean': 3.908798782958911e-05, 'epoch': 0.11} + + 12%|█▏ | 125/1024 [5:24:00<42:22:12, 169.67s/it]INFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:49:00 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 126/1024 [5:26:42<41:42:17, 167.19s/it] + {'loss': 0.0475, 'grad_norm': 0.006009541917592287, 'learning_rate': 1e-05, 'num_tokens': 100699437.0, 'completions/mean_length': 6518.4765625, 'completions/min_length': 969.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 
'completions/mean_terminated_length': 6200.23388671875, 'completions/min_terminated_length': 969.0, 'completions/max_terminated_length': 15200.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01985173299908638, 'sampling/sampling_logp_difference/max': 9.606365203857422, 'sampling/importance_sampling_ratio/min': 6.729899905622005e-05, 'sampling/importance_sampling_ratio/mean': 0.9999701976776123, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.880072832107544, 'clip_ratio/low_mean': 3.4717084645308205e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.679183808140806e-06, 'clip_ratio/high_max': 1.0716735232563224e-05, 'clip_ratio/region_mean': 3.7396268680822686e-05, 'epoch': 0.12} + + 12%|█▏ | 126/1024 [5:26:42<41:42:17, 167.19s/it]INFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:51:41 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▏ | 127/1024 [5:29:38<42:19:43, 169.88s/it] + {'loss': 0.0562, 'grad_norm': 0.00254544778726995, 'learning_rate': 1e-05, 'num_tokens': 101797124.0, 'completions/mean_length': 8421.9296875, 'completions/min_length': 1180.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8030.35205078125, 'completions/min_terminated_length': 1180.0, 'completions/max_terminated_length': 16379.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2603819966316223, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020804740488529205, 'sampling/sampling_logp_difference/max': 10.75251579284668, 
'sampling/importance_sampling_ratio/min': 2.139152456948068e-05, 'sampling/importance_sampling_ratio/mean': 0.9999698400497437, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.929582305252552, 'clip_ratio/low_mean': 3.8401355027417594e-05, 'clip_ratio/low_min': 3.4494178180466406e-06, 'clip_ratio/high_mean': 1.8907661001321685e-06, 'clip_ratio/high_max': 7.563064400528674e-06, 'clip_ratio/region_mean': 4.029212129808002e-05, 'epoch': 0.12} + + 12%|█▏ | 127/1024 [5:29:38<42:19:43, 169.88s/it]INFO 12-01 18:54:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:54:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:54:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:54:38 [block_pool.py:292] Successfully reset prefix cache + + 12%|█▎ | 128/1024 [5:32:08<40:48:49, 163.98s/it] + {'loss': -0.0048, 'grad_norm': 0.0030309113208204508, 'learning_rate': 1e-05, 'num_tokens': 102643751.0, 'completions/mean_length': 6452.5859375, 'completions/min_length': 233.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6214.232421875, 'completions/min_terminated_length': 233.0, 'completions/max_terminated_length': 14871.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3453505039215088, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02046305686235428, 'sampling/sampling_logp_difference/max': 10.81167221069336, 'sampling/importance_sampling_ratio/min': 2.0162780856480822e-05, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9258717745542526, 'clip_ratio/low_mean': 3.5734614471039094e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.125810965480923e-06, 'clip_ratio/high_max': 8.503243861923693e-06, 'clip_ratio/region_mean': 3.7860425095459505e-05, 'epoch': 0.12} + + 
12%|█▎ | 128/1024 [5:32:08<40:48:49, 163.98s/it]INFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 18:57:11 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None + warnings.warn( + + 13%|█▎ | 129/1024 [5:35:10<42:05:23, 169.30s/it] + {'loss': 0.0525, 'grad_norm': 0.0028038588352501392, 'learning_rate': 1e-05, 'num_tokens': 103645849.0, 'completions/mean_length': 7655.140625, 'completions/min_length': 1095.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7373.564453125, 'completions/min_terminated_length': 1095.0, 'completions/max_terminated_length': 16323.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.24435339868068695, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022147968411445618, 'sampling/sampling_logp_difference/max': 3.781249523162842, 'sampling/importance_sampling_ratio/min': 0.022794192656874657, 'sampling/importance_sampling_ratio/mean': 0.9999130964279175, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1112212240695953, 'clip_ratio/low_mean': 2.8848363626821083e-05, 'clip_ratio/low_min': 3.2798930078570265e-06, 'clip_ratio/high_mean': 4.865382209118252e-06, 'clip_ratio/high_max': 1.4670421251139487e-05, 'clip_ratio/region_mean': 3.371374566540908e-05, 'epoch': 0.12} + + 13%|█▎ | 129/1024 [5:35:10<42:05:23, 169.30s/it]INFO 12-01 19:00:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:00:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:00:09 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:00:09 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 130/1024 [5:38:21<43:39:32, 175.81s/it] + {'loss': 0.0942, 'grad_norm': 0.003990175202488899, 'learning_rate': 1e-05, 'num_tokens': 104712987.0, 'completions/mean_length': 8166.765625, 'completions/min_length': 838.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7618.9501953125, 'completions/min_terminated_length': 838.0, 'completions/max_terminated_length': 15694.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2680353820323944, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019254228100180626, 'sampling/sampling_logp_difference/max': 10.624967575073242, 'sampling/importance_sampling_ratio/min': 2.430162021482829e-05, 'sampling/importance_sampling_ratio/mean': 0.9999572038650513, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8589507639408112, 'clip_ratio/low_mean': 2.8828401809732895e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.8828401809732895e-05, 'epoch': 0.12} + + 13%|█▎ | 130/1024 [5:38:21<43:39:32, 175.81s/it]INFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:03:20 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 131/1024 [5:40:55<41:59:10, 169.26s/it] + {'loss': 0.0481, 'grad_norm': 0.0038855294696986675, 'learning_rate': 1e-05, 'num_tokens': 105481743.0, 'completions/mean_length': 5872.40625, 'completions/min_length': 352.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5789.6376953125, 
'completions/min_terminated_length': 352.0, 'completions/max_terminated_length': 15444.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3527044355869293, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021132031455636024, 'sampling/sampling_logp_difference/max': 6.312424659729004, 'sampling/importance_sampling_ratio/min': 0.0018136304570361972, 'sampling/importance_sampling_ratio/mean': 0.9999517202377319, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0606305003166199, 'clip_ratio/low_mean': 3.547307028384239e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9298730080663518e-06, 'clip_ratio/high_max': 7.719492032265407e-06, 'clip_ratio/region_mean': 3.7402943462439e-05, 'epoch': 0.12} + + 13%|█▎ | 131/1024 [5:40:55<41:59:10, 169.26s/it]INFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:05:54 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 132/1024 [5:43:33<41:07:51, 166.00s/it] + {'loss': 0.0487, 'grad_norm': 0.004712321795523167, 'learning_rate': 1e-05, 'num_tokens': 106333695.0, 'completions/mean_length': 6474.9375, 'completions/min_length': 194.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6237.1201171875, 'completions/min_terminated_length': 194.0, 'completions/max_terminated_length': 15742.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3874102830886841, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019161570817232132, 'sampling/sampling_logp_difference/max': 10.098255157470703, 'sampling/importance_sampling_ratio/min': 4.115129559068009e-05, 
'sampling/importance_sampling_ratio/mean': 0.9999421834945679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8699874132871628, 'clip_ratio/low_mean': 4.114894863960217e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.614050223812228e-06, 'clip_ratio/high_max': 1.6221786609094124e-05, 'clip_ratio/region_mean': 4.6762998408667045e-05, 'epoch': 0.12} + + 13%|█▎ | 132/1024 [5:43:33<41:07:51, 166.00s/it]INFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:08:33 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 133/1024 [5:46:14<40:43:52, 164.57s/it] + {'loss': 0.0574, 'grad_norm': 0.0031310587655752897, 'learning_rate': 1e-05, 'num_tokens': 107236363.0, 'completions/mean_length': 6910.03125, 'completions/min_length': 1212.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6604.4189453125, 'completions/min_terminated_length': 1212.0, 'completions/max_terminated_length': 15841.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019823957234621048, 'sampling/sampling_logp_difference/max': 6.661808490753174, 'sampling/importance_sampling_ratio/min': 0.0012788315070793033, 'sampling/importance_sampling_ratio/mean': 1.0000447034835815, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8597542196512222, 'clip_ratio/low_mean': 2.881602637216929e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.188186724401021e-06, 'clip_ratio/high_max': 1.2752746897604084e-05, 'clip_ratio/region_mean': 3.200421309657031e-05, 'epoch': 0.12} + + 13%|█▎ | 133/1024 [5:46:14<40:43:52, 164.57s/it]INFO 12-01 19:11:14 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:11:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:11:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:11:14 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 134/1024 [5:48:58<40:38:06, 164.37s/it] + {'loss': -0.0101, 'grad_norm': 0.006233204621821642, 'learning_rate': 1e-05, 'num_tokens': 108044714.0, 'completions/mean_length': 6172.7421875, 'completions/min_length': 691.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5843.3466796875, 'completions/min_terminated_length': 691.0, 'completions/max_terminated_length': 15311.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2845958471298218, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020428352057933807, 'sampling/sampling_logp_difference/max': 6.656150817871094, 'sampling/importance_sampling_ratio/min': 0.0012860872084274888, 'sampling/importance_sampling_ratio/mean': 0.9999743700027466, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9560965895652771, 'clip_ratio/low_mean': 3.179941927555774e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.02184224665325e-06, 'clip_ratio/high_max': 1.2087368986613e-05, 'clip_ratio/region_mean': 3.482126135168073e-05, 'epoch': 0.12} + + 13%|█▎ | 134/1024 [5:48:58<40:38:06, 164.37s/it]INFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:13:58 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 135/1024 [5:51:30<39:38:11, 160.51s/it] + {'loss': 0.1106, 'grad_norm': 0.005762661807239056, 'learning_rate': 1e-05, 'num_tokens': 108862901.0, 'completions/mean_length': 
6232.4609375, 'completions/min_length': 276.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5988.82421875, 'completions/min_terminated_length': 276.0, 'completions/max_terminated_length': 15737.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3748064339160919, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01772497221827507, 'sampling/sampling_logp_difference/max': 5.4041595458984375, 'sampling/importance_sampling_ratio/min': 0.004497833084315062, 'sampling/importance_sampling_ratio/mean': 0.9999505877494812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.792289063334465, 'clip_ratio/low_mean': 3.8776780229454744e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.616570095095085e-06, 'clip_ratio/high_max': 1.846628038038034e-05, 'clip_ratio/region_mean': 4.339335077929718e-05, 'epoch': 0.12} + + 13%|█▎ | 135/1024 [5:51:30<39:38:11, 160.51s/it]INFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:16:29 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 136/1024 [5:54:11<39:36:35, 160.58s/it] + {'loss': 0.088, 'grad_norm': 0.002916123950853944, 'learning_rate': 1e-05, 'num_tokens': 109544058.0, 'completions/mean_length': 5181.1015625, 'completions/min_length': 695.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5003.27783203125, 'completions/min_terminated_length': 695.0, 'completions/max_terminated_length': 15440.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.3327339291572571, 'frac_reward_zero_std': 0.3125, 
'sampling/sampling_logp_difference/mean': 0.017177307978272438, 'sampling/sampling_logp_difference/max': 14.749001502990723, 'sampling/importance_sampling_ratio/min': 3.9317873756772315e-07, 'sampling/importance_sampling_ratio/mean': 0.999925971031189, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7691714614629745, 'clip_ratio/low_mean': 3.377504378931917e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.782972615023027e-06, 'clip_ratio/high_max': 1.1131890460092109e-05, 'clip_ratio/region_mean': 3.65580164043422e-05, 'epoch': 0.13} + + 13%|█▎ | 136/1024 [5:54:11<39:36:35, 160.58s/it]INFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:19:10 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 137/1024 [5:56:54<39:47:45, 161.52s/it] + {'loss': 0.0303, 'grad_norm': 0.0035183338914066553, 'learning_rate': 1e-05, 'num_tokens': 110282853.0, 'completions/mean_length': 5583.5859375, 'completions/min_length': 537.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5235.185546875, 'completions/min_terminated_length': 537.0, 'completions/max_terminated_length': 15288.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.24381661415100098, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01895858161151409, 'sampling/sampling_logp_difference/max': 6.156238079071045, 'sampling/importance_sampling_ratio/min': 0.0021202145144343376, 'sampling/importance_sampling_ratio/mean': 0.9999736547470093, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.922084204852581, 'clip_ratio/low_mean': 3.033036318811355e-05, 'clip_ratio/low_min': 3.5457974263408687e-06, 'clip_ratio/high_mean': 5.5457699090766255e-06, 
'clip_ratio/high_max': 2.2183079636306502e-05, 'clip_ratio/region_mean': 3.587613309719018e-05, 'epoch': 0.13} + + 13%|█▎ | 137/1024 [5:56:54<39:47:45, 161.52s/it]INFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:21:54 [block_pool.py:292] Successfully reset prefix cache + + 13%|█▎ | 138/1024 [6:00:06<42:00:12, 170.67s/it] + {'loss': 0.0418, 'grad_norm': 0.002201368333771825, 'learning_rate': 1e-05, 'num_tokens': 111228449.0, 'completions/mean_length': 7191.71875, 'completions/min_length': 461.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6659.93359375, 'completions/min_terminated_length': 461.0, 'completions/max_terminated_length': 16255.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01915489323437214, 'sampling/sampling_logp_difference/max': 5.343695163726807, 'sampling/importance_sampling_ratio/min': 0.0047781821340322495, 'sampling/importance_sampling_ratio/mean': 0.9998859167098999, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8676051273941994, 'clip_ratio/low_mean': 2.520359919344628e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.783892558814841e-07, 'clip_ratio/high_max': 2.7135570235259365e-06, 'clip_ratio/region_mean': 2.588198810826725e-05, 'epoch': 0.13} + + 13%|█▎ | 138/1024 [6:00:06<42:00:12, 170.67s/it]INFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:25:06 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▎ | 
139/1024 [6:02:49<41:20:59, 168.20s/it] + {'loss': 0.0979, 'grad_norm': 0.00720562506467104, 'learning_rate': 1e-05, 'num_tokens': 111904700.0, 'completions/mean_length': 5139.5859375, 'completions/min_length': 498.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4869.72021484375, 'completions/min_terminated_length': 498.0, 'completions/max_terminated_length': 16102.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3566659688949585, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.016763046383857727, 'sampling/sampling_logp_difference/max': 11.616515159606934, 'sampling/importance_sampling_ratio/min': 9.015951036417391e-06, 'sampling/importance_sampling_ratio/mean': 0.9999786615371704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7077975794672966, 'clip_ratio/low_mean': 4.164742210832628e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.982446049936698e-06, 'clip_ratio/high_max': 2.2828588043921627e-05, 'clip_ratio/region_mean': 4.962986872669717e-05, 'epoch': 0.13} + + 14%|█▎ | 139/1024 [6:02:49<41:20:59, 168.20s/it]INFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:27:48 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▎ | 140/1024 [6:05:55<42:39:47, 173.74s/it] + {'loss': 0.0855, 'grad_norm': 0.005594039335846901, 'learning_rate': 1e-05, 'num_tokens': 112873218.0, 'completions/mean_length': 7408.296875, 'completions/min_length': 678.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7118.7578125, 'completions/min_terminated_length': 678.0, 'completions/max_terminated_length': 15887.0, 
'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2806568741798401, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018874341621994972, 'sampling/sampling_logp_difference/max': 9.749542236328125, 'sampling/importance_sampling_ratio/min': 5.832135502714664e-05, 'sampling/importance_sampling_ratio/mean': 0.9999697804450989, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8338208198547363, 'clip_ratio/low_mean': 5.0197708333143964e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.257615276197612e-06, 'clip_ratio/high_max': 1.3030461104790447e-05, 'clip_ratio/region_mean': 5.345532326828106e-05, 'epoch': 0.13} + + 14%|█▎ | 140/1024 [6:05:55<42:39:47, 173.74s/it]INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:30:55 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 141/1024 [6:09:01<43:28:25, 177.24s/it] + {'loss': 0.089, 'grad_norm': 0.0025491444393992424, 'learning_rate': 1e-05, 'num_tokens': 113869418.0, 'completions/mean_length': 7637.25, 'completions/min_length': 943.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7131.2392578125, 'completions/min_terminated_length': 943.0, 'completions/max_terminated_length': 16158.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.32641828060150146, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020018339157104492, 'sampling/sampling_logp_difference/max': 14.212298393249512, 'sampling/importance_sampling_ratio/min': 6.724766876686772e-07, 'sampling/importance_sampling_ratio/mean': 0.9999139308929443, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9943022206425667, 'clip_ratio/low_mean': 3.066379792926455e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.586851668544114e-07, 'clip_ratio/high_max': 2.6347406674176455e-06, 'clip_ratio/region_mean': 3.132248309611896e-05, 'epoch': 0.13} + + 14%|█▍ | 141/1024 [6:09:01<43:28:25, 177.24s/it]INFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:34:00 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 142/1024 [6:11:42<42:15:06, 172.46s/it] + {'loss': 0.1115, 'grad_norm': 0.003907687962055206, 'learning_rate': 1e-05, 'num_tokens': 114674257.0, 'completions/mean_length': 6144.8671875, 'completions/min_length': 1000.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6064.244140625, 'completions/min_terminated_length': 1000.0, 'completions/max_terminated_length': 16199.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.287486732006073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018300339579582214, 'sampling/sampling_logp_difference/max': 5.673813343048096, 'sampling/importance_sampling_ratio/min': 0.003434742335230112, 'sampling/importance_sampling_ratio/mean': 0.9999485611915588, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9252935722470284, 'clip_ratio/low_mean': 2.370427267806008e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.372918283479521e-06, 'clip_ratio/high_max': 1.7491673133918084e-05, 'clip_ratio/region_mean': 2.8077190734165924e-05, 'epoch': 0.13} + + 14%|█▍ | 142/1024 [6:11:42<42:15:06, 172.46s/it]INFO 12-01 19:36:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:36:42 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:36:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:36:42 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 143/1024 [6:14:09<40:21:06, 164.89s/it] + {'loss': 0.023, 'grad_norm': 0.0042014638893306255, 'learning_rate': 1e-05, 'num_tokens': 115496300.0, 'completions/mean_length': 6266.6484375, 'completions/min_length': 919.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6186.984375, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 15768.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.18884867429733276, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021998615935444832, 'sampling/sampling_logp_difference/max': 12.561980247497559, 'sampling/importance_sampling_ratio/min': 3.502686922729481e-06, 'sampling/importance_sampling_ratio/mean': 0.9999801516532898, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0926234126091003, 'clip_ratio/low_mean': 2.688816772433711e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0681611658801557e-06, 'clip_ratio/high_max': 8.272644663520623e-06, 'clip_ratio/region_mean': 2.8956328833373846e-05, 'epoch': 0.13} + + 14%|█▍ | 143/1024 [6:14:09<40:21:06, 164.89s/it]INFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:39:09 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 144/1024 [6:16:41<39:20:34, 160.95s/it] + {'loss': 0.0404, 'grad_norm': 0.0028757627587765455, 'learning_rate': 1e-05, 'num_tokens': 116333286.0, 'completions/mean_length': 6392.890625, 'completions/min_length': 559.0, 'completions/max_length': 
16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6234.3017578125, 'completions/min_terminated_length': 559.0, 'completions/max_terminated_length': 15504.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.35665616393089294, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019069479778409004, 'sampling/sampling_logp_difference/max': 15.27328872680664, 'sampling/importance_sampling_ratio/min': 2.327528392243039e-07, 'sampling/importance_sampling_ratio/mean': 1.0000064373016357, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9028401970863342, 'clip_ratio/low_mean': 4.51459295618406e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.860460075586161e-06, 'clip_ratio/high_max': 2.7441840302344644e-05, 'clip_ratio/region_mean': 5.200638997848728e-05, 'epoch': 0.13} + + 14%|█▍ | 144/1024 [6:16:41<39:20:34, 160.95s/it]INFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:41:41 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 145/1024 [6:19:13<38:38:38, 158.27s/it] + {'loss': 0.0858, 'grad_norm': 0.006776242982596159, 'learning_rate': 1e-05, 'num_tokens': 117158619.0, 'completions/mean_length': 6300.1640625, 'completions/min_length': 73.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6220.763671875, 'completions/min_terminated_length': 73.0, 'completions/max_terminated_length': 16183.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.29826053977012634, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022119753062725067, 
'sampling/sampling_logp_difference/max': 14.249761581420898, 'sampling/importance_sampling_ratio/min': 6.477496299339691e-07, 'sampling/importance_sampling_ratio/mean': 0.9998651742935181, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.110174722969532, 'clip_ratio/low_mean': 3.626850991622632e-05, 'clip_ratio/low_min': 4.492201696848497e-06, 'clip_ratio/high_mean': 3.0424674832829623e-06, 'clip_ratio/high_max': 1.216986993313185e-05, 'clip_ratio/region_mean': 3.931097762688296e-05, 'epoch': 0.13} + + 14%|█▍ | 145/1024 [6:19:13<38:38:38, 158.27s/it]INFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:44:13 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 146/1024 [6:21:58<39:06:40, 160.37s/it] + {'loss': 0.0041, 'grad_norm': 0.003441061358898878, 'learning_rate': 1e-05, 'num_tokens': 118140579.0, 'completions/mean_length': 7482.25, 'completions/min_length': 169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7340.95263671875, 'completions/min_terminated_length': 169.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.2109375, 'rewards/accuracy_reward/std': 0.4095771610736847, 'reward': 0.2109375, 'reward_std': 0.23250605165958405, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020916422829031944, 'sampling/sampling_logp_difference/max': 11.356839179992676, 'sampling/importance_sampling_ratio/min': 1.1689271559589542e-05, 'sampling/importance_sampling_ratio/mean': 0.9999172687530518, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9957183450460434, 'clip_ratio/low_mean': 1.452984838579141e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.926812046804116e-06, 'clip_ratio/high_max': 7.707248187216464e-06, 
'clip_ratio/region_mean': 1.6456660432595527e-05, 'epoch': 0.13} + + 14%|█▍ | 146/1024 [6:21:58<39:06:40, 160.37s/it]INFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:46:58 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 147/1024 [6:24:37<38:57:01, 159.89s/it] + {'loss': 0.0601, 'grad_norm': 0.0035624606534838676, 'learning_rate': 1e-05, 'num_tokens': 118982515.0, 'completions/mean_length': 6411.125, 'completions/min_length': 415.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6252.82568359375, 'completions/min_terminated_length': 415.0, 'completions/max_terminated_length': 16193.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3913620114326477, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.020998675376176834, 'sampling/sampling_logp_difference/max': 3.96539044380188, 'sampling/importance_sampling_ratio/min': 0.018960632383823395, 'sampling/importance_sampling_ratio/mean': 0.9999991655349731, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9852773621678352, 'clip_ratio/low_mean': 4.652173765862244e-05, 'clip_ratio/low_min': 8.251542112702737e-06, 'clip_ratio/high_mean': 3.4127203889511293e-06, 'clip_ratio/high_max': 1.3650881555804517e-05, 'clip_ratio/region_mean': 4.993445759282622e-05, 'epoch': 0.14} + + 14%|█▍ | 147/1024 [6:24:37<38:57:01, 159.89s/it]INFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:49:37 [block_pool.py:292] Successfully reset prefix cache + + 14%|█▍ | 148/1024 [6:27:38<40:27:19, 
166.26s/it] + {'loss': 0.0756, 'grad_norm': 0.004949269350618124, 'learning_rate': 1e-05, 'num_tokens': 119851003.0, 'completions/mean_length': 6640.75, 'completions/min_length': 1204.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6326.45166015625, 'completions/min_terminated_length': 1204.0, 'completions/max_terminated_length': 15146.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01905224658548832, 'sampling/sampling_logp_difference/max': 9.749635696411133, 'sampling/importance_sampling_ratio/min': 5.8315905334893614e-05, 'sampling/importance_sampling_ratio/mean': 0.9999769926071167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8645239844918251, 'clip_ratio/low_mean': 2.3662243620492518e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.276765594113385e-06, 'clip_ratio/high_max': 1.710706237645354e-05, 'clip_ratio/region_mean': 2.7939009100919066e-05, 'epoch': 0.14} + + 14%|█▍ | 148/1024 [6:27:38<40:27:19, 166.26s/it]INFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:52:38 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▍ | 149/1024 [6:30:31<40:54:55, 168.34s/it] + {'loss': 0.1008, 'grad_norm': 0.005622676108032465, 'learning_rate': 1e-05, 'num_tokens': 120765165.0, 'completions/mean_length': 6987.953125, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6444.3798828125, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 16061.0, 'rewards/accuracy_reward/mean': 0.421875, 
'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.39796435832977295, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01969297230243683, 'sampling/sampling_logp_difference/max': 9.292106628417969, 'sampling/importance_sampling_ratio/min': 9.214873716700822e-05, 'sampling/importance_sampling_ratio/mean': 0.9999727010726929, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9469119384884834, 'clip_ratio/low_mean': 5.667686264132499e-05, 'clip_ratio/low_min': 3.2221478249994107e-06, 'clip_ratio/high_mean': 2.0922732346662087e-06, 'clip_ratio/high_max': 5.033624802308623e-06, 'clip_ratio/region_mean': 5.876913564861752e-05, 'epoch': 0.14} + + 15%|█▍ | 149/1024 [6:30:31<40:54:55, 168.34s/it]INFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:55:31 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▍ | 150/1024 [6:33:34<41:54:35, 172.63s/it] + {'loss': -0.0093, 'grad_norm': 0.0035846447572112083, 'learning_rate': 1e-05, 'num_tokens': 121749426.0, 'completions/mean_length': 7539.2265625, 'completions/min_length': 103.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6949.5751953125, 'completions/min_terminated_length': 103.0, 'completions/max_terminated_length': 16218.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.22461043298244476, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02050059661269188, 'sampling/sampling_logp_difference/max': 11.749993324279785, 'sampling/importance_sampling_ratio/min': 7.889377229730599e-06, 'sampling/importance_sampling_ratio/mean': 1.0000232458114624, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 
0.983614593744278, 'clip_ratio/low_mean': 3.030186894648068e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8565209529697313e-06, 'clip_ratio/high_max': 4.223829364491394e-06, 'clip_ratio/region_mean': 3.21583895583899e-05, 'epoch': 0.14} + + 15%|█▍ | 150/1024 [6:33:34<41:54:35, 172.63s/it]INFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 19:58:34 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▍ | 151/1024 [6:36:14<40:54:26, 168.69s/it] + {'loss': 0.0479, 'grad_norm': 0.005333681590855122, 'learning_rate': 1e-05, 'num_tokens': 122579975.0, 'completions/mean_length': 6339.5390625, 'completions/min_length': 363.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5845.548828125, 'completions/min_terminated_length': 363.0, 'completions/max_terminated_length': 15528.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.327729195356369, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019756250083446503, 'sampling/sampling_logp_difference/max': 6.091750144958496, 'sampling/importance_sampling_ratio/min': 0.0022614477202296257, 'sampling/importance_sampling_ratio/mean': 0.9999289512634277, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9051830619573593, 'clip_ratio/low_mean': 4.44662659901951e-05, 'clip_ratio/low_min': 5.9182802942814305e-06, 'clip_ratio/high_mean': 2.6333877940487582e-06, 'clip_ratio/high_max': 1.0533551176195033e-05, 'clip_ratio/region_mean': 4.7099654238991207e-05, 'epoch': 0.14} + + 15%|█▍ | 151/1024 [6:36:14<40:54:26, 168.69s/it]INFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:01:13 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▍ | 152/1024 [6:38:49<39:54:24, 164.75s/it] + {'loss': 0.1029, 'grad_norm': 0.005628545768558979, 'learning_rate': 1e-05, 'num_tokens': 123444686.0, 'completions/mean_length': 6610.8046875, 'completions/min_length': 856.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6533.8505859375, 'completions/min_terminated_length': 856.0, 'completions/max_terminated_length': 15321.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3498311936855316, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019961554557085037, 'sampling/sampling_logp_difference/max': 5.890087127685547, 'sampling/importance_sampling_ratio/min': 0.0027667356189340353, 'sampling/importance_sampling_ratio/mean': 0.9999935030937195, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9121239259839058, 'clip_ratio/low_mean': 5.054293433204293e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4903662304277532e-06, 'clip_ratio/high_max': 5.961464921711013e-06, 'clip_ratio/region_mean': 5.2033300562470686e-05, 'epoch': 0.14} + + 15%|█▍ | 152/1024 [6:38:49<39:54:24, 164.75s/it]INFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:03:49 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▍ | 153/1024 [6:41:40<40:17:41, 166.55s/it] + {'loss': 0.0179, 'grad_norm': 0.00521192466840148, 'learning_rate': 1e-05, 'num_tokens': 124389325.0, 'completions/mean_length': 7214.5546875, 'completions/min_length': 493.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 
'completions/mean_terminated_length': 6684.0908203125, 'completions/min_terminated_length': 493.0, 'completions/max_terminated_length': 15071.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.26538968086242676, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02030467614531517, 'sampling/sampling_logp_difference/max': 3.246713638305664, 'sampling/importance_sampling_ratio/min': 0.03890184313058853, 'sampling/importance_sampling_ratio/mean': 1.0000994205474854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9393481463193893, 'clip_ratio/low_mean': 4.231535649523721e-05, 'clip_ratio/low_min': 3.3862490909086773e-06, 'clip_ratio/high_mean': 2.778689122351352e-06, 'clip_ratio/high_max': 7.918152277852641e-06, 'clip_ratio/region_mean': 4.509404539021489e-05, 'epoch': 0.14} + + 15%|█▍ | 153/1024 [6:41:40<40:17:41, 166.55s/it]INFO 12-01 20:06:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:06:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:06:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:06:40 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▌ | 154/1024 [6:44:32<40:41:12, 168.36s/it] + {'loss': 0.0557, 'grad_norm': 0.0034769594203680754, 'learning_rate': 1e-05, 'num_tokens': 125344827.0, 'completions/mean_length': 7307.296875, 'completions/min_length': 656.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6938.32470703125, 'completions/min_terminated_length': 656.0, 'completions/max_terminated_length': 15349.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.35035035014152527, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0197945274412632, 'sampling/sampling_logp_difference/max': 9.88245964050293, 
'sampling/importance_sampling_ratio/min': 5.1062532293144614e-05, 'sampling/importance_sampling_ratio/mean': 0.9999738335609436, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9287968128919601, 'clip_ratio/low_mean': 4.0359405488743505e-05, 'clip_ratio/low_min': 3.400342848181026e-06, 'clip_ratio/high_mean': 3.274841219536029e-06, 'clip_ratio/high_max': 1.3099364878144115e-05, 'clip_ratio/region_mean': 4.363424682196637e-05, 'epoch': 0.14} + + 15%|█▌ | 154/1024 [6:44:32<40:41:12, 168.36s/it]INFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:09:32 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▌ | 155/1024 [6:47:15<40:14:28, 166.71s/it] + {'loss': 0.058, 'grad_norm': 0.005860861856490374, 'learning_rate': 1e-05, 'num_tokens': 126294060.0, 'completions/mean_length': 7255.5703125, 'completions/min_length': 401.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7110.6748046875, 'completions/min_terminated_length': 401.0, 'completions/max_terminated_length': 14940.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.29719966650009155, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019849762320518494, 'sampling/sampling_logp_difference/max': 6.374942779541016, 'sampling/importance_sampling_ratio/min': 0.0017037172801792622, 'sampling/importance_sampling_ratio/mean': 0.9999392032623291, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9288185387849808, 'clip_ratio/low_mean': 3.123730675724801e-05, 'clip_ratio/low_min': 4.124868155486183e-06, 'clip_ratio/high_mean': 1.607209924259223e-06, 'clip_ratio/high_max': 6.428839697036892e-06, 'clip_ratio/region_mean': 3.284451713625458e-05, 
'epoch': 0.14} + + 15%|█▌ | 155/1024 [6:47:15<40:14:28, 166.71s/it]INFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:12:15 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▌ | 156/1024 [6:50:02<40:11:21, 166.68s/it] + {'loss': 0.0365, 'grad_norm': 0.004109901376068592, 'learning_rate': 1e-05, 'num_tokens': 127163746.0, 'completions/mean_length': 6662.796875, 'completions/min_length': 402.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6429.48828125, 'completions/min_terminated_length': 402.0, 'completions/max_terminated_length': 16174.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2782978415489197, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018542557954788208, 'sampling/sampling_logp_difference/max': 6.249782562255859, 'sampling/importance_sampling_ratio/min': 0.001930873841047287, 'sampling/importance_sampling_ratio/mean': 0.9998985528945923, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8072321340441704, 'clip_ratio/low_mean': 4.209472854199703e-05, 'clip_ratio/low_min': 3.21056154461985e-06, 'clip_ratio/high_mean': 2.8721049147861777e-06, 'clip_ratio/high_max': 1.148841965914471e-05, 'clip_ratio/region_mean': 4.496683322940953e-05, 'epoch': 0.14} + + 15%|█▌ | 156/1024 [6:50:02<40:11:21, 166.68s/it]INFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:15:02 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▌ | 157/1024 [6:52:51<40:18:06, 167.34s/it] + {'loss': 0.1272, 'grad_norm': 
0.005437003914266825, 'learning_rate': 1e-05, 'num_tokens': 128035690.0, 'completions/mean_length': 6638.5625, 'completions/min_length': 730.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6483.87353515625, 'completions/min_terminated_length': 730.0, 'completions/max_terminated_length': 16168.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.325370192527771, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019497953355312347, 'sampling/sampling_logp_difference/max': 7.152168273925781, 'sampling/importance_sampling_ratio/min': 0.0007831641123630106, 'sampling/importance_sampling_ratio/mean': 0.9999808073043823, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9228496253490448, 'clip_ratio/low_mean': 3.845731936280572e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7114781434866018e-06, 'clip_ratio/high_max': 1.4845912573946407e-05, 'clip_ratio/region_mean': 4.216879796103967e-05, 'epoch': 0.14} + + 15%|█▌ | 157/1024 [6:52:51<40:18:06, 167.34s/it]INFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:17:50 [block_pool.py:292] Successfully reset prefix cache + + 15%|█▌ | 158/1024 [6:55:46<40:51:20, 169.84s/it] + {'loss': 0.0553, 'grad_norm': 0.004606325179338455, 'learning_rate': 1e-05, 'num_tokens': 129114487.0, 'completions/mean_length': 8279.7890625, 'completions/min_length': 1084.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7810.9501953125, 'completions/min_terminated_length': 1084.0, 'completions/max_terminated_length': 16133.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 
'reward': 0.3359375, 'reward_std': 0.2998581528663635, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02114839106798172, 'sampling/sampling_logp_difference/max': 11.899483680725098, 'sampling/importance_sampling_ratio/min': 6.793912234570598e-06, 'sampling/importance_sampling_ratio/mean': 0.9999224543571472, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9365477114915848, 'clip_ratio/low_mean': 5.087737986286811e-05, 'clip_ratio/low_min': 1.7309419035882456e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.087737986286811e-05, 'epoch': 0.15} + + 15%|█▌ | 158/1024 [6:55:46<40:51:20, 169.84s/it]INFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:20:46 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 159/1024 [6:58:46<41:29:01, 172.65s/it] + {'loss': 0.0979, 'grad_norm': 0.0032216343097388744, 'learning_rate': 1e-05, 'num_tokens': 130011934.0, 'completions/mean_length': 6874.5546875, 'completions/min_length': 379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6406.87646484375, 'completions/min_terminated_length': 379.0, 'completions/max_terminated_length': 15157.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.28801077604293823, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01938377134501934, 'sampling/sampling_logp_difference/max': 5.874353408813477, 'sampling/importance_sampling_ratio/min': 0.0028106109239161015, 'sampling/importance_sampling_ratio/mean': 0.9999432563781738, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8596161976456642, 'clip_ratio/low_mean': 4.6293902641991735e-05, 
'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.731617188255768e-06, 'clip_ratio/high_max': 2.8393386855896097e-05, 'clip_ratio/region_mean': 5.402551937550015e-05, 'epoch': 0.15} + + 16%|█▌ | 159/1024 [6:58:46<41:29:01, 172.65s/it]INFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:23:45 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 160/1024 [7:01:47<42:04:04, 175.28s/it] + {'loss': 0.0401, 'grad_norm': 0.0032756594009697437, 'learning_rate': 1e-05, 'num_tokens': 130870045.0, 'completions/mean_length': 6554.3671875, 'completions/min_length': 957.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6154.78857421875, 'completions/min_terminated_length': 957.0, 'completions/max_terminated_length': 16193.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3006146550178528, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019287925213575363, 'sampling/sampling_logp_difference/max': 18.499998092651367, 'sampling/importance_sampling_ratio/min': 9.237467679668043e-09, 'sampling/importance_sampling_ratio/mean': 0.9999619722366333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9097465947270393, 'clip_ratio/low_mean': 2.8597237701433187e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.425736511213472e-06, 'clip_ratio/high_max': 9.702946044853888e-06, 'clip_ratio/region_mean': 3.1022973985272984e-05, 'epoch': 0.15} + + 16%|█▌ | 160/1024 [7:01:47<42:04:04, 175.28s/it]INFO 12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache +INFO 
12-01 20:26:47 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 161/1024 [7:04:34<41:25:53, 172.83s/it] + {'loss': 0.069, 'grad_norm': 0.003530750283971429, 'learning_rate': 1e-05, 'num_tokens': 131812236.0, 'completions/mean_length': 7199.9921875, 'completions/min_length': 431.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6903.73388671875, 'completions/min_terminated_length': 431.0, 'completions/max_terminated_length': 15371.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.30221718549728394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02212757244706154, 'sampling/sampling_logp_difference/max': 12.864561080932617, 'sampling/importance_sampling_ratio/min': 2.5881658984872047e-06, 'sampling/importance_sampling_ratio/mean': 0.9999665021896362, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9904173016548157, 'clip_ratio/low_mean': 4.071546266004589e-05, 'clip_ratio/low_min': 2.701884795897058e-06, 'clip_ratio/high_mean': 5.969264975647093e-06, 'clip_ratio/high_max': 2.387705990258837e-05, 'clip_ratio/region_mean': 4.6684727863066655e-05, 'epoch': 0.15} + + 16%|█▌ | 161/1024 [7:04:34<41:25:53, 172.83s/it]INFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:29:34 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 162/1024 [7:07:13<40:21:26, 168.55s/it] + {'loss': 0.0287, 'grad_norm': 0.004500554408878088, 'learning_rate': 1e-05, 'num_tokens': 132711448.0, 'completions/mean_length': 6822.59375, 'completions/min_length': 139.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6670.82568359375, 
'completions/min_terminated_length': 139.0, 'completions/max_terminated_length': 16281.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02111719362437725, 'sampling/sampling_logp_difference/max': 15.995189666748047, 'sampling/importance_sampling_ratio/min': 1.1307781022651398e-07, 'sampling/importance_sampling_ratio/mean': 0.9998499751091003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0052980855107307, 'clip_ratio/low_mean': 4.526082898337336e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.81041513467062e-06, 'clip_ratio/high_max': 1.924166053868248e-05, 'clip_ratio/region_mean': 5.007124354960979e-05, 'epoch': 0.15} + + 16%|█▌ | 162/1024 [7:07:13<40:21:26, 168.55s/it]INFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:32:12 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 163/1024 [7:10:16<41:21:09, 172.90s/it] + {'loss': 0.0782, 'grad_norm': 0.0020288117229938507, 'learning_rate': 1e-05, 'num_tokens': 133729832.0, 'completions/mean_length': 7792.9375, 'completions/min_length': 957.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7515.80615234375, 'completions/min_terminated_length': 957.0, 'completions/max_terminated_length': 16109.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2501322627067566, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020228523761034012, 'sampling/sampling_logp_difference/max': 6.4299726486206055, 'sampling/importance_sampling_ratio/min': 0.001612494932487607, 
'sampling/importance_sampling_ratio/mean': 0.9999821782112122, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9114394783973694, 'clip_ratio/low_mean': 1.9409651486057555e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.331508196424693e-06, 'clip_ratio/high_max': 1.3326032785698771e-05, 'clip_ratio/region_mean': 2.274115956879541e-05, 'epoch': 0.15} + + 16%|█▌ | 163/1024 [7:10:16<41:21:09, 172.90s/it]INFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:35:15 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 164/1024 [7:13:02<40:49:12, 170.87s/it] + {'loss': -0.0036, 'grad_norm': 0.006685085594654083, 'learning_rate': 1e-05, 'num_tokens': 134507182.0, 'completions/mean_length': 5908.671875, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5826.18896484375, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 15171.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01997402310371399, 'sampling/sampling_logp_difference/max': 7.111015796661377, 'sampling/importance_sampling_ratio/min': 0.0008160656434483826, 'sampling/importance_sampling_ratio/mean': 0.9999651908874512, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9869658201932907, 'clip_ratio/low_mean': 2.9356229674704082e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.089760639340966e-06, 'clip_ratio/high_max': 1.2359042557363864e-05, 'clip_ratio/region_mean': 3.244599008667137e-05, 'epoch': 0.15} + + 16%|█▌ | 164/1024 [7:13:02<40:49:12, 170.87s/it]INFO 12-01 20:38:02 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:38:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:38:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:38:02 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 165/1024 [7:15:48<40:27:37, 169.57s/it] + {'loss': 0.0946, 'grad_norm': 0.003854887094348669, 'learning_rate': 1e-05, 'num_tokens': 135446382.0, 'completions/mean_length': 7188.0, 'completions/min_length': 585.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6735.7373046875, 'completions/min_terminated_length': 585.0, 'completions/max_terminated_length': 16000.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020226184278726578, 'sampling/sampling_logp_difference/max': 6.780747890472412, 'sampling/importance_sampling_ratio/min': 0.0011354254093021154, 'sampling/importance_sampling_ratio/mean': 0.9998975992202759, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9519504383206367, 'clip_ratio/low_mean': 3.215114134036412e-05, 'clip_ratio/low_min': 3.941849627153715e-06, 'clip_ratio/high_mean': 2.1278583517414518e-06, 'clip_ratio/high_max': 8.511433406965807e-06, 'clip_ratio/region_mean': 3.427900014685292e-05, 'epoch': 0.15} + + 16%|█▌ | 165/1024 [7:15:48<40:27:37, 169.57s/it]INFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:40:48 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▌ | 166/1024 [7:18:21<39:10:12, 164.35s/it] + {'loss': 0.0055, 'grad_norm': 0.006265874952077866, 'learning_rate': 1e-05, 'num_tokens': 136213233.0, 
'completions/mean_length': 5843.5234375, 'completions/min_length': 251.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5676.21484375, 'completions/min_terminated_length': 251.0, 'completions/max_terminated_length': 15712.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021706756204366684, 'sampling/sampling_logp_difference/max': 6.129936218261719, 'sampling/importance_sampling_ratio/min': 0.002176719717681408, 'sampling/importance_sampling_ratio/mean': 0.9999513626098633, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9677107483148575, 'clip_ratio/low_mean': 1.9188738406228367e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.778701175680908e-06, 'clip_ratio/high_max': 7.114804702723632e-06, 'clip_ratio/region_mean': 2.0967439695596113e-05, 'epoch': 0.15} + + 16%|█▌ | 166/1024 [7:18:21<39:10:12, 164.35s/it]INFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:43:20 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▋ | 167/1024 [7:21:11<39:34:17, 166.23s/it] + {'loss': 0.0052, 'grad_norm': 0.0018056798726320267, 'learning_rate': 1e-05, 'num_tokens': 137123405.0, 'completions/mean_length': 6942.15625, 'completions/min_length': 517.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6637.58056640625, 'completions/min_terminated_length': 517.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.172288179397583, 'frac_reward_zero_std': 
0.625, 'sampling/sampling_logp_difference/mean': 0.02278529666364193, 'sampling/sampling_logp_difference/max': 3.781208038330078, 'sampling/importance_sampling_ratio/min': 0.022795137017965317, 'sampling/importance_sampling_ratio/mean': 0.9999101161956787, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.076062560081482, 'clip_ratio/low_mean': 2.429895857858355e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4804112424826599e-06, 'clip_ratio/high_max': 5.9216449699306395e-06, 'clip_ratio/region_mean': 3.910307100341015e-06, 'epoch': 0.15} + + 16%|█▋ | 167/1024 [7:21:11<39:34:17, 166.23s/it]INFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:46:11 [block_pool.py:292] Successfully reset prefix cache + + 16%|█▋ | 168/1024 [7:24:01<39:47:12, 167.33s/it] + {'loss': 0.0702, 'grad_norm': 0.002132089575752616, 'learning_rate': 1e-05, 'num_tokens': 138084464.0, 'completions/mean_length': 7368.4609375, 'completions/min_length': 660.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7001.9755859375, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3148210048675537, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020028186962008476, 'sampling/sampling_logp_difference/max': 9.874905586242676, 'sampling/importance_sampling_ratio/min': 5.144971510162577e-05, 'sampling/importance_sampling_ratio/mean': 0.999951958656311, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9278362467885017, 'clip_ratio/low_mean': 4.042915224999888e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.364482027900522e-06, 
'clip_ratio/high_max': 2.8421666684153024e-05, 'clip_ratio/region_mean': 4.8793634050525725e-05, 'epoch': 0.15} + + 16%|█▋ | 168/1024 [7:24:01<39:47:12, 167.33s/it]INFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:49:01 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 169/1024 [7:26:59<40:30:51, 170.59s/it] + {'loss': 0.0708, 'grad_norm': 0.003180777421221137, 'learning_rate': 1e-05, 'num_tokens': 139164722.0, 'completions/mean_length': 8278.578125, 'completions/min_length': 1203.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 8017.11279296875, 'completions/min_terminated_length': 1203.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.29143065214157104, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020700933411717415, 'sampling/sampling_logp_difference/max': 12.29391098022461, 'sampling/importance_sampling_ratio/min': 4.579544565785909e-06, 'sampling/importance_sampling_ratio/mean': 0.9999357461929321, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9731236174702644, 'clip_ratio/low_mean': 3.8177841361175524e-05, 'clip_ratio/low_min': 9.023873644764535e-06, 'clip_ratio/high_mean': 1.7118109099101275e-06, 'clip_ratio/high_max': 6.84724363964051e-06, 'clip_ratio/region_mean': 3.988965249845933e-05, 'epoch': 0.16} + + 17%|█▋ | 169/1024 [7:26:59<40:30:51, 170.59s/it]INFO 12-01 20:51:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:51:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:51:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:51:59 [block_pool.py:292] Successfully reset 
prefix cache + + 17%|█▋ | 170/1024 [7:29:55<40:51:20, 172.23s/it] + {'loss': 0.0955, 'grad_norm': 0.004162010736763477, 'learning_rate': 1e-05, 'num_tokens': 140109163.0, 'completions/mean_length': 7237.2578125, 'completions/min_length': 1078.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6865.43896484375, 'completions/min_terminated_length': 1078.0, 'completions/max_terminated_length': 16136.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.33903974294662476, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017928704619407654, 'sampling/sampling_logp_difference/max': 10.63192367553711, 'sampling/importance_sampling_ratio/min': 2.4133163606165908e-05, 'sampling/importance_sampling_ratio/mean': 0.9999967813491821, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7624354660511017, 'clip_ratio/low_mean': 4.41923687048984e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.712801448178652e-06, 'clip_ratio/high_max': 2.3081439849192975e-05, 'clip_ratio/region_mean': 5.190517117625859e-05, 'epoch': 0.16} + + 17%|█▋ | 170/1024 [7:29:55<40:51:20, 172.23s/it]INFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:54:55 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 171/1024 [7:32:53<41:13:03, 173.96s/it] + {'loss': 0.0442, 'grad_norm': 0.003527693450450897, 'learning_rate': 1e-05, 'num_tokens': 141063738.0, 'completions/mean_length': 7307.4296875, 'completions/min_length': 290.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7089.59228515625, 'completions/min_terminated_length': 290.0, 'completions/max_terminated_length': 
15857.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.22673209011554718, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021664291620254517, 'sampling/sampling_logp_difference/max': 10.455191612243652, 'sampling/importance_sampling_ratio/min': 2.8798374842153862e-05, 'sampling/importance_sampling_ratio/mean': 0.9998871088027954, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9450376927852631, 'clip_ratio/low_mean': 2.0606968291758676e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.502144406615116e-06, 'clip_ratio/high_max': 1.8008577626460465e-05, 'clip_ratio/region_mean': 2.510911281206063e-05, 'epoch': 0.16} + + 17%|█▋ | 171/1024 [7:32:53<41:13:03, 173.96s/it]INFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 20:57:53 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 172/1024 [7:35:32<40:06:29, 169.47s/it] + {'loss': 0.0778, 'grad_norm': 0.002400327706709504, 'learning_rate': 1e-05, 'num_tokens': 141848599.0, 'completions/mean_length': 5985.9765625, 'completions/min_length': 714.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5736.42431640625, 'completions/min_terminated_length': 714.0, 'completions/max_terminated_length': 16275.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.1922685205936432, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018963739275932312, 'sampling/sampling_logp_difference/max': 18.115007400512695, 'sampling/importance_sampling_ratio/min': 1.3575387924902316e-08, 'sampling/importance_sampling_ratio/mean': 0.9999374151229858, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8568939119577408, 'clip_ratio/low_mean': 3.323748410366534e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.788794740306912e-06, 'clip_ratio/high_max': 1.9155178961227648e-05, 'clip_ratio/region_mean': 3.802627873028541e-05, 'epoch': 0.16} + + 17%|█▋ | 172/1024 [7:35:32<40:06:29, 169.47s/it]INFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:00:32 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 173/1024 [7:38:24<40:12:47, 170.11s/it] + {'loss': 0.0625, 'grad_norm': 0.003575773909687996, 'learning_rate': 1e-05, 'num_tokens': 142902666.0, 'completions/mean_length': 8078.8359375, 'completions/min_length': 594.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7810.92724609375, 'completions/min_terminated_length': 594.0, 'completions/max_terminated_length': 15111.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3022122383117676, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021685753017663956, 'sampling/sampling_logp_difference/max': 13.205151557922363, 'sampling/importance_sampling_ratio/min': 1.8410922848488553e-06, 'sampling/importance_sampling_ratio/mean': 0.9999899864196777, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0634759217500687, 'clip_ratio/low_mean': 4.1565862602510606e-05, 'clip_ratio/low_min': 6.89249168317474e-06, 'clip_ratio/high_mean': 4.978134711564053e-06, 'clip_ratio/high_max': 1.6673273876222083e-05, 'clip_ratio/region_mean': 4.654399640457996e-05, 'epoch': 0.16} + + 17%|█▋ | 173/1024 [7:38:24<40:12:47, 170.11s/it]INFO 12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache +INFO 
12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:03:24 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 174/1024 [7:41:22<40:42:25, 172.41s/it] + {'loss': 0.0364, 'grad_norm': 0.003307635197415948, 'learning_rate': 1e-05, 'num_tokens': 143967484.0, 'completions/mean_length': 8138.515625, 'completions/min_length': 660.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7588.81689453125, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 15876.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.31800350546836853, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02233392372727394, 'sampling/sampling_logp_difference/max': 2.537085771560669, 'sampling/importance_sampling_ratio/min': 0.07909657061100006, 'sampling/importance_sampling_ratio/mean': 1.0000429153442383, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0329038575291634, 'clip_ratio/low_mean': 4.288118509521155e-05, 'clip_ratio/low_min': 7.69851726545312e-06, 'clip_ratio/high_mean': 3.081458999076858e-06, 'clip_ratio/high_max': 1.2325835996307433e-05, 'clip_ratio/region_mean': 4.596264443534892e-05, 'epoch': 0.16} + + 17%|█▋ | 174/1024 [7:41:22<40:42:25, 172.41s/it]INFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:06:21 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 175/1024 [7:44:27<41:33:00, 176.18s/it] + {'loss': 0.0258, 'grad_norm': 0.0022392498794943094, 'learning_rate': 1e-05, 'num_tokens': 145028608.0, 'completions/mean_length': 8144.21875, 'completions/min_length': 828.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7878.4189453125, 'completions/min_terminated_length': 828.0, 'completions/max_terminated_length': 16324.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.20411096513271332, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0203234925866127, 'sampling/sampling_logp_difference/max': 12.749860763549805, 'sampling/importance_sampling_ratio/min': 2.9027246455370914e-06, 'sampling/importance_sampling_ratio/mean': 0.9999473094940186, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9547601044178009, 'clip_ratio/low_mean': 3.4071419804604375e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.789598162664333e-06, 'clip_ratio/high_max': 2.3158392650657333e-05, 'clip_ratio/region_mean': 3.986101773989503e-05, 'epoch': 0.16} + + 17%|█▋ | 175/1024 [7:44:27<41:33:00, 176.18s/it]INFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:09:26 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 176/1024 [7:47:01<39:58:34, 169.71s/it] + {'loss': 0.085, 'grad_norm': 0.005551324691623449, 'learning_rate': 1e-05, 'num_tokens': 145851292.0, 'completions/mean_length': 6289.40625, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6129.1748046875, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 16327.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.327729195356369, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020259611308574677, 
'sampling/sampling_logp_difference/max': 5.996909141540527, 'sampling/importance_sampling_ratio/min': 0.0024864254519343376, 'sampling/importance_sampling_ratio/mean': 0.9999369382858276, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9483931511640549, 'clip_ratio/low_mean': 3.57260964847228e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.326393539282435e-06, 'clip_ratio/high_max': 1.330557415712974e-05, 'clip_ratio/region_mean': 3.905248979663156e-05, 'epoch': 0.16} + + 17%|█▋ | 176/1024 [7:47:01<39:58:34, 169.71s/it]INFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:12:01 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 177/1024 [7:50:09<41:10:47, 175.03s/it] + {'loss': 0.0757, 'grad_norm': 0.0038497373461723328, 'learning_rate': 1e-05, 'num_tokens': 147004723.0, 'completions/mean_length': 8855.9296875, 'completions/min_length': 1004.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8354.05859375, 'completions/min_terminated_length': 1004.0, 'completions/max_terminated_length': 16176.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02178027108311653, 'sampling/sampling_logp_difference/max': 7.8969340324401855, 'sampling/importance_sampling_ratio/min': 0.0003718819934874773, 'sampling/importance_sampling_ratio/mean': 1.0000008344650269, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.003264345228672, 'clip_ratio/low_mean': 5.073524926046957e-05, 'clip_ratio/low_min': 2.859953838196816e-06, 'clip_ratio/high_mean': 2.086053825678391e-06, 'clip_ratio/high_max': 8.344215302713565e-06, 
'clip_ratio/region_mean': 5.282130268824403e-05, 'epoch': 0.16} + + 17%|█▋ | 177/1024 [7:50:09<41:10:47, 175.03s/it]INFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:15:08 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 178/1024 [7:53:04<41:07:55, 175.03s/it] + {'loss': 0.054, 'grad_norm': 0.005027150269597769, 'learning_rate': 1e-05, 'num_tokens': 147996190.0, 'completions/mean_length': 7574.3359375, 'completions/min_length': 856.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7434.50048828125, 'completions/min_terminated_length': 856.0, 'completions/max_terminated_length': 16199.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3316858410835266, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.020686112344264984, 'sampling/sampling_logp_difference/max': 12.769495964050293, 'sampling/importance_sampling_ratio/min': 2.846284814950195e-06, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9448538422584534, 'clip_ratio/low_mean': 4.947490833728807e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0921258939997642e-06, 'clip_ratio/high_max': 1.2368503575999057e-05, 'clip_ratio/region_mean': 5.256703434497467e-05, 'epoch': 0.16} + + 17%|█▋ | 178/1024 [7:53:04<41:07:55, 175.03s/it]INFO 12-01 21:18:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:18:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:18:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:18:04 [block_pool.py:292] Successfully reset prefix cache + + 17%|█▋ | 179/1024 [7:55:52<40:34:07, 172.84s/it] + 
{'loss': 0.0743, 'grad_norm': 0.00325182662345469, 'learning_rate': 1e-05, 'num_tokens': 148931006.0, 'completions/mean_length': 7162.5625, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6787.70703125, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15821.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3492894768714905, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02043815702199936, 'sampling/sampling_logp_difference/max': 15.537620544433594, 'sampling/importance_sampling_ratio/min': 1.7868870827442151e-07, 'sampling/importance_sampling_ratio/mean': 0.9999456405639648, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8928515017032623, 'clip_ratio/low_mean': 3.363430948866153e-05, 'clip_ratio/low_min': 3.5745945297094295e-06, 'clip_ratio/high_mean': 4.189188416603429e-06, 'clip_ratio/high_max': 1.6756753666413715e-05, 'clip_ratio/region_mean': 3.7823498018951796e-05, 'epoch': 0.16} + + 17%|█▋ | 179/1024 [7:55:52<40:34:07, 172.84s/it]INFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:20:51 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 180/1024 [7:58:56<41:20:23, 176.33s/it] + {'loss': 0.0388, 'grad_norm': 0.003250610316172242, 'learning_rate': 1e-05, 'num_tokens': 149968481.0, 'completions/mean_length': 7958.2109375, 'completions/min_length': 809.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7396.4921875, 'completions/min_terminated_length': 809.0, 'completions/max_terminated_length': 16163.0, 'rewards/accuracy_reward/mean': 0.3046875, 
'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2858940362930298, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020478684455156326, 'sampling/sampling_logp_difference/max': 13.499983787536621, 'sampling/importance_sampling_ratio/min': 1.370981294712692e-06, 'sampling/importance_sampling_ratio/mean': 0.999974250793457, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8763524517416954, 'clip_ratio/low_mean': 2.8009484594804235e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.204079798204475e-06, 'clip_ratio/high_max': 2.08163191928179e-05, 'clip_ratio/region_mean': 3.3213564165635034e-05, 'epoch': 0.17} + + 18%|█▊ | 180/1024 [7:58:56<41:20:23, 176.33s/it]INFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:23:56 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 181/1024 [8:01:55<41:29:37, 177.20s/it] + {'loss': 0.019, 'grad_norm': 0.004865634720772505, 'learning_rate': 1e-05, 'num_tokens': 150768791.0, 'completions/mean_length': 6120.296875, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5789.20947265625, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 15728.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.24671241641044617, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01739395596086979, 'sampling/sampling_logp_difference/max': 10.249953269958496, 'sampling/importance_sampling_ratio/min': 3.535915311658755e-05, 'sampling/importance_sampling_ratio/mean': 0.9999062418937683, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 
0.7507334873080254, 'clip_ratio/low_mean': 1.937760777082076e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.918068043480162e-06, 'clip_ratio/high_max': 1.4398233361134771e-05, 'clip_ratio/region_mean': 2.4295676269048272e-05, 'epoch': 0.17} + + 18%|█▊ | 181/1024 [8:01:55<41:29:37, 177.20s/it]INFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:26:55 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 182/1024 [8:05:08<42:33:11, 181.94s/it] + {'loss': 0.0609, 'grad_norm': 0.0027805580757558346, 'learning_rate': 1e-05, 'num_tokens': 151844301.0, 'completions/mean_length': 8231.671875, 'completions/min_length': 1231.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.109375, 'completions/mean_terminated_length': 7230.5087890625, 'completions/min_terminated_length': 1231.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.35088711977005005, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019971080124378204, 'sampling/sampling_logp_difference/max': 6.454617977142334, 'sampling/importance_sampling_ratio/min': 0.0015732402680441737, 'sampling/importance_sampling_ratio/mean': 0.999957799911499, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8613645136356354, 'clip_ratio/low_mean': 5.480891331899329e-05, 'clip_ratio/low_min': 9.078275525098434e-06, 'clip_ratio/high_mean': 2.9266581691445026e-06, 'clip_ratio/high_max': 1.170663267657801e-05, 'clip_ratio/region_mean': 5.773557131760754e-05, 'epoch': 0.17} + + 18%|█▊ | 182/1024 [8:05:08<42:33:11, 181.94s/it]INFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:30:08 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 183/1024 [8:07:36<40:07:34, 171.77s/it] + {'loss': 0.0534, 'grad_norm': 0.0028903940692543983, 'learning_rate': 1e-05, 'num_tokens': 152638356.0, 'completions/mean_length': 6038.4921875, 'completions/min_length': 769.0, 'completions/max_length': 15682.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6038.4921875, 'completions/min_terminated_length': 769.0, 'completions/max_terminated_length': 15682.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3022122383117676, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019382324069738388, 'sampling/sampling_logp_difference/max': 12.374916076660156, 'sampling/importance_sampling_ratio/min': 4.2232054511259776e-06, 'sampling/importance_sampling_ratio/mean': 0.9999019503593445, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8801494240760803, 'clip_ratio/low_mean': 4.333486742780224e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.714717084018048e-06, 'clip_ratio/high_max': 1.0858868336072192e-05, 'clip_ratio/region_mean': 4.60495848528808e-05, 'epoch': 0.17} + + 18%|█▊ | 183/1024 [8:07:36<40:07:34, 171.77s/it]INFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:32:36 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 184/1024 [8:10:33<40:25:51, 173.28s/it] + {'loss': 0.0796, 'grad_norm': 0.0029546513687819242, 'learning_rate': 1e-05, 'num_tokens': 153618418.0, 'completions/mean_length': 7506.921875, 'completions/min_length': 557.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 
'completions/mean_terminated_length': 7070.34375, 'completions/min_terminated_length': 557.0, 'completions/max_terminated_length': 16235.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.3448137044906616, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01928526908159256, 'sampling/sampling_logp_difference/max': 14.616228103637695, 'sampling/importance_sampling_ratio/min': 4.4900667717229226e-07, 'sampling/importance_sampling_ratio/mean': 1.0000388622283936, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8713229671120644, 'clip_ratio/low_mean': 4.994629193788569e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.830143276038143e-06, 'clip_ratio/high_max': 7.320573104152572e-06, 'clip_ratio/region_mean': 5.177643492970674e-05, 'epoch': 0.17} + + 18%|█▊ | 184/1024 [8:10:33<40:25:51, 173.28s/it]INFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:35:33 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 185/1024 [8:13:19<39:51:20, 171.01s/it] + {'loss': 0.0837, 'grad_norm': 0.002384800696745515, 'learning_rate': 1e-05, 'num_tokens': 154502440.0, 'completions/mean_length': 6725.921875, 'completions/min_length': 253.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6649.8740234375, 'completions/min_terminated_length': 253.0, 'completions/max_terminated_length': 13999.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020737573504447937, 'sampling/sampling_logp_difference/max': 7.082281589508057, 
'sampling/importance_sampling_ratio/min': 0.0008398547652177513, 'sampling/importance_sampling_ratio/mean': 0.9999340772628784, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9011344686150551, 'clip_ratio/low_mean': 2.8494011758084525e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2481475500389934e-06, 'clip_ratio/high_max': 1.2992590200155973e-05, 'clip_ratio/region_mean': 3.174215930812352e-05, 'epoch': 0.17} + + 18%|█▊ | 185/1024 [8:13:19<39:51:20, 171.01s/it]INFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:38:18 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-01 21:40:22,616 - math_verify.grader - WARNING - Timeout during comparison + + 18%|█▊ | 186/1024 [8:16:26<40:56:32, 175.89s/it] + {'loss': 0.0678, 'grad_norm': 0.0033664393704384565, 'learning_rate': 1e-05, 'num_tokens': 155454988.0, 'completions/mean_length': 7285.78125, 'completions/min_length': 1176.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6992.2900390625, 'completions/min_terminated_length': 1176.0, 'completions/max_terminated_length': 15862.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022076331079006195, 'sampling/sampling_logp_difference/max': 7.873225212097168, 'sampling/importance_sampling_ratio/min': 0.0003808041801676154, 'sampling/importance_sampling_ratio/mean': 0.999931275844574, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.028538629412651, 'clip_ratio/low_mean': 3.7723172567893926e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.414224342028319e-06, 'clip_ratio/high_max': 
2.686360085135675e-05, 'clip_ratio/region_mean': 4.5137397364669596e-05, 'epoch': 0.17} + + 18%|█▊ | 186/1024 [8:16:26<40:56:32, 175.89s/it]INFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:41:26 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 187/1024 [8:19:43<42:23:03, 182.30s/it] + {'loss': 0.0995, 'grad_norm': 0.0029569920152425766, 'learning_rate': 1e-05, 'num_tokens': 156439609.0, 'completions/mean_length': 7546.1015625, 'completions/min_length': 794.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6956.90869140625, 'completions/min_terminated_length': 794.0, 'completions/max_terminated_length': 16380.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.305637001991272, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021088771522045135, 'sampling/sampling_logp_difference/max': 4.609542369842529, 'sampling/importance_sampling_ratio/min': 0.009956372901797295, 'sampling/importance_sampling_ratio/mean': 0.9999749660491943, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9216663613915443, 'clip_ratio/low_mean': 3.613749231590191e-05, 'clip_ratio/low_min': 6.27866324975912e-06, 'clip_ratio/high_mean': 2.9093872626617667e-06, 'clip_ratio/high_max': 1.1637549050647067e-05, 'clip_ratio/region_mean': 3.904687946487684e-05, 'epoch': 0.17} + + 18%|█▊ | 187/1024 [8:19:43<42:23:03, 182.30s/it]INFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:44:43 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 
188/1024 [8:22:51<42:40:40, 183.78s/it] + {'loss': 0.0039, 'grad_norm': 0.0023973705247044563, 'learning_rate': 1e-05, 'num_tokens': 157343374.0, 'completions/mean_length': 6866.6015625, 'completions/min_length': 866.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6791.66162109375, 'completions/min_terminated_length': 866.0, 'completions/max_terminated_length': 16271.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2511882185935974, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021616388112306595, 'sampling/sampling_logp_difference/max': 9.502913475036621, 'sampling/importance_sampling_ratio/min': 7.46340665500611e-05, 'sampling/importance_sampling_ratio/mean': 0.9999228715896606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9553637430071831, 'clip_ratio/low_mean': 1.9624552805908024e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6212559330597287e-06, 'clip_ratio/high_max': 6.485023732238915e-06, 'clip_ratio/region_mean': 2.1245808738967753e-05, 'epoch': 0.17} + + 18%|█▊ | 188/1024 [8:22:51<42:40:40, 183.78s/it]INFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:47:50 [block_pool.py:292] Successfully reset prefix cache + + 18%|█▊ | 189/1024 [8:25:47<42:07:39, 181.63s/it] + {'loss': 0.0056, 'grad_norm': 0.0023072708863765, 'learning_rate': 1e-05, 'num_tokens': 158173719.0, 'completions/mean_length': 6335.9453125, 'completions/min_length': 469.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 5754.65283203125, 'completions/min_terminated_length': 469.0, 'completions/max_terminated_length': 14284.0, 
'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018506702035665512, 'sampling/sampling_logp_difference/max': 8.732585906982422, 'sampling/importance_sampling_ratio/min': 0.0001612449559615925, 'sampling/importance_sampling_ratio/mean': 0.9998940229415894, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8574290797114372, 'clip_ratio/low_mean': 3.832016966498486e-05, 'clip_ratio/low_min': 5.240211066848133e-06, 'clip_ratio/high_mean': 2.2777185222366825e-06, 'clip_ratio/high_max': 9.11087408894673e-06, 'clip_ratio/region_mean': 4.059788818722154e-05, 'epoch': 0.17} + + 18%|█▊ | 189/1024 [8:25:47<42:07:39, 181.63s/it]INFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:50:47 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▊ | 190/1024 [8:28:55<42:30:26, 183.49s/it] + {'loss': 0.041, 'grad_norm': 0.004400993697345257, 'learning_rate': 1e-05, 'num_tokens': 159248410.0, 'completions/mean_length': 8239.8984375, 'completions/min_length': 1080.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7768.751953125, 'completions/min_terminated_length': 1080.0, 'completions/max_terminated_length': 15951.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.32325342297554016, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02096184343099594, 'sampling/sampling_logp_difference/max': 13.686293601989746, 'sampling/importance_sampling_ratio/min': 1.1379369198039058e-06, 'sampling/importance_sampling_ratio/mean': 0.9998342990875244, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8983379155397415, 'clip_ratio/low_mean': 5.610333710137638e-05, 'clip_ratio/low_min': 1.3168393707019277e-05, 'clip_ratio/high_mean': 9.993626633786334e-06, 'clip_ratio/high_max': 3.0578403084291494e-05, 'clip_ratio/region_mean': 6.609696265513776e-05, 'epoch': 0.17} + + 19%|█▊ | 190/1024 [8:28:55<42:30:26, 183.49s/it]INFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:53:55 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▊ | 191/1024 [8:31:39<41:04:22, 177.51s/it] + {'loss': 0.0723, 'grad_norm': 0.00661451555788517, 'learning_rate': 1e-05, 'num_tokens': 160109904.0, 'completions/mean_length': 6580.921875, 'completions/min_length': 727.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 5659.26513671875, 'completions/min_terminated_length': 727.0, 'completions/max_terminated_length': 13741.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3874102830886841, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017984790727496147, 'sampling/sampling_logp_difference/max': 7.927308082580566, 'sampling/importance_sampling_ratio/min': 0.00036075623938813806, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8277688398957253, 'clip_ratio/low_mean': 6.66748674120754e-05, 'clip_ratio/low_min': 1.5295650428015506e-05, 'clip_ratio/high_mean': 2.2566434836335247e-06, 'clip_ratio/high_max': 9.026573934534099e-06, 'clip_ratio/region_mean': 6.89315111230826e-05, 'epoch': 0.18} + + 19%|█▊ | 191/1024 [8:31:39<41:04:22, 177.51s/it]INFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:56:38 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▉ | 192/1024 [8:34:29<40:30:01, 175.24s/it] + {'loss': 0.0368, 'grad_norm': 0.004417019430547953, 'learning_rate': 1e-05, 'num_tokens': 161103384.0, 'completions/mean_length': 7627.0, 'completions/min_length': 1916.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7416.83251953125, 'completions/min_terminated_length': 1916.0, 'completions/max_terminated_length': 16027.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3634909689426422, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01947963796555996, 'sampling/sampling_logp_difference/max': 9.937321662902832, 'sampling/importance_sampling_ratio/min': 4.833659477299079e-05, 'sampling/importance_sampling_ratio/mean': 0.9998986721038818, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8832443356513977, 'clip_ratio/low_mean': 4.045673085784074e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8589515207168006e-06, 'clip_ratio/high_max': 7.435806082867202e-06, 'clip_ratio/region_mean': 4.2315682549087796e-05, 'epoch': 0.18} + + 19%|█▉ | 192/1024 [8:34:29<40:30:01, 175.24s/it]INFO 12-01 21:59:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:59:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:59:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 21:59:31 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 19%|█▉ | 193/1024 [8:37:33<41:05:52, 178.04s/it] + {'loss': 0.0426, 'grad_norm': 0.0030983765609562397, 'learning_rate': 1e-05, 'num_tokens': 162199765.0, 'completions/mean_length': 8426.1015625, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7965.72705078125, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 16073.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.2540663480758667, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02070600539445877, 'sampling/sampling_logp_difference/max': 6.999904155731201, 'sampling/importance_sampling_ratio/min': 0.0009119694004766643, 'sampling/importance_sampling_ratio/mean': 0.9999411106109619, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8188603445887566, 'clip_ratio/low_mean': 2.6134909091979353e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.485296079157706e-06, 'clip_ratio/high_max': 9.941184316630824e-06, 'clip_ratio/region_mean': 2.8620205910101504e-05, 'epoch': 0.18} + + 19%|█▉ | 193/1024 [8:37:33<41:05:52, 178.04s/it]INFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:02:33 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▉ | 194/1024 [8:40:32<41:06:50, 178.33s/it] + {'loss': 0.052, 'grad_norm': 0.003430198412388563, 'learning_rate': 1e-05, 'num_tokens': 163133232.0, 'completions/mean_length': 7154.2109375, 'completions/min_length': 1387.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6856.4755859375, 'completions/min_terminated_length': 1387.0, 
'completions/max_terminated_length': 15904.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2120065689086914, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02190260961651802, 'sampling/sampling_logp_difference/max': 7.753361225128174, 'sampling/importance_sampling_ratio/min': 0.00042929715709760785, 'sampling/importance_sampling_ratio/mean': 1.0000275373458862, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9913735538721085, 'clip_ratio/low_mean': 3.7853451885894174e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.530347718580742e-06, 'clip_ratio/high_max': 2.612139087432297e-05, 'clip_ratio/region_mean': 4.438379949078808e-05, 'epoch': 0.18} + + 19%|█▉ | 194/1024 [8:40:32<41:06:50, 178.33s/it]INFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:05:32 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▉ | 195/1024 [8:43:39<41:39:41, 180.92s/it] + {'loss': 0.0449, 'grad_norm': 0.002780586015433073, 'learning_rate': 1e-05, 'num_tokens': 164134393.0, 'completions/mean_length': 7693.1328125, 'completions/min_length': 1077.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7412.7822265625, 'completions/min_terminated_length': 1077.0, 'completions/max_terminated_length': 16252.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.20411095023155212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021110571920871735, 'sampling/sampling_logp_difference/max': 14.848588943481445, 'sampling/importance_sampling_ratio/min': 3.559096626304381e-07, 'sampling/importance_sampling_ratio/mean': 
0.9999028444290161, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9887127950787544, 'clip_ratio/low_mean': 3.384581600585079e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.960363745951327e-07, 'clip_ratio/high_max': 3.1841454983805306e-06, 'clip_ratio/region_mean': 3.4641852380445926e-05, 'epoch': 0.18} + + 19%|█▉ | 195/1024 [8:43:39<41:39:41, 180.92s/it]INFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:08:39 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▉ | 196/1024 [8:46:24<40:28:27, 175.98s/it] + {'loss': 0.0541, 'grad_norm': 0.0030156150460243225, 'learning_rate': 1e-05, 'num_tokens': 165063412.0, 'completions/mean_length': 7072.1484375, 'completions/min_length': 695.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6771.76611328125, 'completions/min_terminated_length': 695.0, 'completions/max_terminated_length': 16129.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019325289875268936, 'sampling/sampling_logp_difference/max': 12.999247550964355, 'sampling/importance_sampling_ratio/min': 2.2620308754994767e-06, 'sampling/importance_sampling_ratio/mean': 0.9998926520347595, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.861792616546154, 'clip_ratio/low_mean': 5.182203130971175e-05, 'clip_ratio/low_min': 1.5574546068819473e-05, 'clip_ratio/high_mean': 5.008155312680174e-06, 'clip_ratio/high_max': 9.770586984814145e-06, 'clip_ratio/region_mean': 5.683018616764457e-05, 'epoch': 0.18} + + 19%|█▉ | 196/1024 [8:46:24<40:28:27, 175.98s/it]INFO 12-01 22:11:23 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-01 22:11:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:11:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:11:23 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▉ | 197/1024 [8:49:20<40:26:03, 176.01s/it] + {'loss': 0.0161, 'grad_norm': 0.0034921523183584213, 'learning_rate': 1e-05, 'num_tokens': 166024306.0, 'completions/mean_length': 7353.421875, 'completions/min_length': 916.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7062.11279296875, 'completions/min_terminated_length': 916.0, 'completions/max_terminated_length': 15062.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2909066081047058, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019593238830566406, 'sampling/sampling_logp_difference/max': 7.576326847076416, 'sampling/importance_sampling_ratio/min': 0.0005124400486238301, 'sampling/importance_sampling_ratio/mean': 0.9999784231185913, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8961873054504395, 'clip_ratio/low_mean': 6.156819108582567e-05, 'clip_ratio/low_min': 5.763157332694391e-06, 'clip_ratio/high_mean': 6.455301331698138e-06, 'clip_ratio/high_max': 2.2510209873871645e-05, 'clip_ratio/region_mean': 6.802349253121065e-05, 'epoch': 0.18} + + 19%|█▉ | 197/1024 [8:49:20<40:26:03, 176.01s/it]INFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:14:19 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▉ | 198/1024 [8:52:00<39:18:16, 171.30s/it] + {'loss': 0.0635, 'grad_norm': 0.0027784397825598717, 'learning_rate': 1e-05, 'num_tokens': 166984982.0, 'completions/mean_length': 7348.03125, 
'completions/min_length': 1619.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6903.63916015625, 'completions/min_terminated_length': 1619.0, 'completions/max_terminated_length': 15604.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3437528908252716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01857386901974678, 'sampling/sampling_logp_difference/max': 6.905689716339111, 'sampling/importance_sampling_ratio/min': 0.0010020677000284195, 'sampling/importance_sampling_ratio/mean': 1.0000090599060059, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.824029266834259, 'clip_ratio/low_mean': 5.347559840629401e-05, 'clip_ratio/low_min': 6.613406640099129e-06, 'clip_ratio/high_mean': 4.292725350296678e-06, 'clip_ratio/high_max': 1.3040991007073899e-05, 'clip_ratio/region_mean': 5.776832381343411e-05, 'epoch': 0.18} + + 19%|█▉ | 198/1024 [8:52:00<39:18:16, 171.30s/it]INFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:17:00 [block_pool.py:292] Successfully reset prefix cache + + 19%|█▉ | 199/1024 [8:54:49<39:05:55, 170.61s/it] + {'loss': 0.0165, 'grad_norm': 0.004110465291887522, 'learning_rate': 1e-05, 'num_tokens': 167936971.0, 'completions/mean_length': 7290.4765625, 'completions/min_length': 471.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6920.82080078125, 'completions/min_terminated_length': 471.0, 'completions/max_terminated_length': 16358.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.35901516675949097, 'frac_reward_zero_std': 0.1875, 
'sampling/sampling_logp_difference/mean': 0.019696572795510292, 'sampling/sampling_logp_difference/max': 13.219663619995117, 'sampling/importance_sampling_ratio/min': 1.8145670992453233e-06, 'sampling/importance_sampling_ratio/mean': 0.9999493360519409, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8884479627013206, 'clip_ratio/low_mean': 3.2080681648949394e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0969530649163062e-05, 'clip_ratio/high_max': 3.330808067403268e-05, 'clip_ratio/region_mean': 4.3050211388617754e-05, 'epoch': 0.18} + + 19%|█▉ | 199/1024 [8:54:49<39:05:55, 170.61s/it]INFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:19:49 [block_pool.py:292] Successfully reset prefix cache + + 20%|█▉ | 200/1024 [8:57:56<40:09:34, 175.45s/it] + {'loss': 0.1147, 'grad_norm': 0.002410614863038063, 'learning_rate': 1e-05, 'num_tokens': 168955683.0, 'completions/mean_length': 7803.625, 'completions/min_length': 929.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 6833.66943359375, 'completions/min_terminated_length': 929.0, 'completions/max_terminated_length': 15824.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.29538238048553467, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018545793369412422, 'sampling/sampling_logp_difference/max': 7.035423755645752, 'sampling/importance_sampling_ratio/min': 0.0008801451185718179, 'sampling/importance_sampling_ratio/mean': 0.999977707862854, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8326860442757607, 'clip_ratio/low_mean': 3.466498992565903e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4433944076918124e-06, 
'clip_ratio/high_max': 9.77357763076725e-06, 'clip_ratio/region_mean': 3.710838473125477e-05, 'epoch': 0.18} + + 20%|█▉ | 200/1024 [8:57:56<40:09:34, 175.45s/it]INFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:22:55 [block_pool.py:292] Successfully reset prefix cache + + 20%|█▉ | 201/1024 [9:00:32<38:48:24, 169.75s/it] + {'loss': 0.0499, 'grad_norm': 0.0034376555122435093, 'learning_rate': 1e-05, 'num_tokens': 169845823.0, 'completions/mean_length': 6804.34375, 'completions/min_length': 645.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6495.322265625, 'completions/min_terminated_length': 645.0, 'completions/max_terminated_length': 16272.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.31534504890441895, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020515555515885353, 'sampling/sampling_logp_difference/max': 17.850955963134766, 'sampling/importance_sampling_ratio/min': 1.767780588579626e-08, 'sampling/importance_sampling_ratio/mean': 1.0000131130218506, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9669496119022369, 'clip_ratio/low_mean': 3.4781527119776e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6505314824353263e-06, 'clip_ratio/high_max': 1.4602125929741305e-05, 'clip_ratio/region_mean': 3.8432058772741584e-05, 'epoch': 0.18} + + 20%|█▉ | 201/1024 [9:00:32<38:48:24, 169.75s/it]INFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:25:32 [block_pool.py:292] Successfully reset prefix cache + + 20%|█▉ | 
202/1024 [9:03:28<39:11:15, 171.63s/it] + {'loss': 0.1046, 'grad_norm': 0.0026675171684473753, 'learning_rate': 1e-05, 'num_tokens': 170738210.0, 'completions/mean_length': 6827.9609375, 'completions/min_length': 156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6105.23583984375, 'completions/min_terminated_length': 156.0, 'completions/max_terminated_length': 16350.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.2698654532432556, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019948139786720276, 'sampling/sampling_logp_difference/max': 5.840882778167725, 'sampling/importance_sampling_ratio/min': 0.002906275913119316, 'sampling/importance_sampling_ratio/mean': 1.000019907951355, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8833946585655212, 'clip_ratio/low_mean': 3.574208744794305e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.953680618451472e-06, 'clip_ratio/high_max': 1.5814722473805887e-05, 'clip_ratio/region_mean': 3.9695768407455034e-05, 'epoch': 0.19} + + 20%|█▉ | 202/1024 [9:03:28<39:11:15, 171.63s/it]INFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:28:28 [block_pool.py:292] Successfully reset prefix cache + + 20%|█▉ | 203/1024 [9:06:14<38:42:59, 169.77s/it] + {'loss': 0.034, 'grad_norm': 0.0039620306342840195, 'learning_rate': 1e-05, 'num_tokens': 171705152.0, 'completions/mean_length': 7377.984375, 'completions/min_length': 556.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7307.07080078125, 'completions/min_terminated_length': 556.0, 'completions/max_terminated_length': 15725.0, 
'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01964445412158966, 'sampling/sampling_logp_difference/max': 10.614632606506348, 'sampling/importance_sampling_ratio/min': 2.4554079573135823e-05, 'sampling/importance_sampling_ratio/mean': 0.999995231628418, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8881714344024658, 'clip_ratio/low_mean': 6.462372630267055e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.1557804593139736e-06, 'clip_ratio/high_max': 1.6623121837255894e-05, 'clip_ratio/region_mean': 6.877950727357529e-05, 'epoch': 0.19} + + 20%|█▉ | 203/1024 [9:06:14<38:42:59, 169.77s/it]INFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:31:13 [block_pool.py:292] Successfully reset prefix cache + + 20%|█▉ | 204/1024 [9:09:12<39:14:05, 172.25s/it] + {'loss': 0.0268, 'grad_norm': 0.0040458571165800095, 'learning_rate': 1e-05, 'num_tokens': 172501881.0, 'completions/mean_length': 6051.8828125, 'completions/min_length': 819.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5543.74560546875, 'completions/min_terminated_length': 819.0, 'completions/max_terminated_length': 15265.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.16781240701675415, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01957303285598755, 'sampling/sampling_logp_difference/max': 6.120361804962158, 'sampling/importance_sampling_ratio/min': 0.0021976607386022806, 'sampling/importance_sampling_ratio/mean': 0.9999410510063171, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8851477280259132, 'clip_ratio/low_mean': 2.775239624952519e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.409777835055138e-06, 'clip_ratio/high_max': 9.639111340220552e-06, 'clip_ratio/region_mean': 3.0162174198267167e-05, 'epoch': 0.19} + + 20%|█▉ | 204/1024 [9:09:12<39:14:05, 172.25s/it]INFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:34:11 [block_pool.py:292] Successfully reset prefix cache + + 20%|██ | 205/1024 [9:12:01<38:58:01, 171.28s/it] + {'loss': 0.0427, 'grad_norm': 0.005941574461758137, 'learning_rate': 1e-05, 'num_tokens': 173522391.0, 'completions/mean_length': 7830.171875, 'completions/min_length': 954.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7409.4912109375, 'completions/min_terminated_length': 954.0, 'completions/max_terminated_length': 16034.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.33668074011802673, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021295130252838135, 'sampling/sampling_logp_difference/max': 9.052275657653809, 'sampling/importance_sampling_ratio/min': 0.00011712420382536948, 'sampling/importance_sampling_ratio/mean': 1.0000017881393433, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9070459827780724, 'clip_ratio/low_mean': 5.158422732165491e-05, 'clip_ratio/low_min': 1.1939961495954776e-05, 'clip_ratio/high_mean': 3.529455852913088e-06, 'clip_ratio/high_max': 9.72708312474424e-06, 'clip_ratio/region_mean': 5.5113683174567996e-05, 'epoch': 0.19} + + 20%|██ | 205/1024 [9:12:01<38:58:01, 171.28s/it]INFO 12-01 22:37:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 
22:37:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:37:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:37:00 [block_pool.py:292] Successfully reset prefix cache + + 20%|██ | 206/1024 [9:15:08<40:01:06, 176.12s/it] + {'loss': 0.0273, 'grad_norm': 0.0025851845275610685, 'learning_rate': 1e-05, 'num_tokens': 174504534.0, 'completions/mean_length': 7520.6796875, 'completions/min_length': 1321.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6769.55078125, 'completions/min_terminated_length': 1321.0, 'completions/max_terminated_length': 15443.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2188364714384079, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02016005665063858, 'sampling/sampling_logp_difference/max': 7.835196018218994, 'sampling/importance_sampling_ratio/min': 0.00039556476986035705, 'sampling/importance_sampling_ratio/mean': 0.999911367893219, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8843575045466423, 'clip_ratio/low_mean': 1.718775109793569e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3885803582525114e-06, 'clip_ratio/high_max': 5.5543214330100454e-06, 'clip_ratio/region_mean': 1.8576331683561875e-05, 'epoch': 0.19} + + 20%|██ | 206/1024 [9:15:08<40:01:06, 176.12s/it]INFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:40:08 [block_pool.py:292] Successfully reset prefix cache + + 20%|██ | 207/1024 [9:18:08<40:15:08, 177.37s/it] + {'loss': 0.047, 'grad_norm': 0.004170550964772701, 'learning_rate': 1e-05, 'num_tokens': 175472574.0, 'completions/mean_length': 7382.1875, 'completions/min_length': 934.0, 'completions/max_length': 
16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6861.42138671875, 'completions/min_terminated_length': 934.0, 'completions/max_terminated_length': 16173.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020749717950820923, 'sampling/sampling_logp_difference/max': 10.481352806091309, 'sampling/importance_sampling_ratio/min': 2.8054744689143263e-05, 'sampling/importance_sampling_ratio/mean': 0.9999932646751404, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.916313610970974, 'clip_ratio/low_mean': 3.617897255026037e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.536370288908074e-06, 'clip_ratio/high_max': 1.0145481155632297e-05, 'clip_ratio/region_mean': 3.871534295285528e-05, 'epoch': 0.19} + + 20%|██ | 207/1024 [9:18:08<40:15:08, 177.37s/it]INFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:43:08 [block_pool.py:292] Successfully reset prefix cache + + 20%|██ | 208/1024 [9:20:26<37:31:58, 165.59s/it] + {'loss': 0.0447, 'grad_norm': 0.004663965664803982, 'learning_rate': 1e-05, 'num_tokens': 176275568.0, 'completions/mean_length': 6122.453125, 'completions/min_length': 1192.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6041.6533203125, 'completions/min_terminated_length': 1192.0, 'completions/max_terminated_length': 13891.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3284856975078583, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020278753712773323, 
'sampling/sampling_logp_difference/max': 11.74999713897705, 'sampling/importance_sampling_ratio/min': 7.88934721640544e-06, 'sampling/importance_sampling_ratio/mean': 0.9999363422393799, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8984386026859283, 'clip_ratio/low_mean': 3.83663013963087e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.83663013963087e-05, 'epoch': 0.19} + + 20%|██ | 208/1024 [9:20:26<37:31:58, 165.59s/it]INFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:45:26 [block_pool.py:292] Successfully reset prefix cache + + 20%|██ | 209/1024 [9:22:53<36:10:49, 159.82s/it] + {'loss': 0.1066, 'grad_norm': 0.004848882555961609, 'learning_rate': 1e-05, 'num_tokens': 176932549.0, 'completions/mean_length': 4983.2890625, 'completions/min_length': 589.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4709.67236328125, 'completions/min_terminated_length': 589.0, 'completions/max_terminated_length': 15547.0, 'rewards/accuracy_reward/mean': 0.6484375, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.6484375, 'reward_std': 0.2772369980812073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017959970980882645, 'sampling/sampling_logp_difference/max': 11.026308059692383, 'sampling/importance_sampling_ratio/min': 1.626804078114219e-05, 'sampling/importance_sampling_ratio/mean': 0.9999616146087646, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.825260303914547, 'clip_ratio/low_mean': 4.3961883989140915e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6337880828796187e-06, 'clip_ratio/high_max': 1.4535152331518475e-05, 'clip_ratio/region_mean': 4.7595671958333696e-05, 'epoch': 0.19} + 
+ 20%|██ | 209/1024 [9:22:53<36:10:49, 159.82s/it]INFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:47:52 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 210/1024 [9:25:17<35:04:16, 155.11s/it] + {'loss': 0.0977, 'grad_norm': 0.004749474115669727, 'learning_rate': 1e-05, 'num_tokens': 177691752.0, 'completions/mean_length': 5766.5234375, 'completions/min_length': 700.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5511.7041015625, 'completions/min_terminated_length': 700.0, 'completions/max_terminated_length': 15415.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2738044261932373, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019118282943964005, 'sampling/sampling_logp_difference/max': 11.626367568969727, 'sampling/importance_sampling_ratio/min': 8.927558155846782e-06, 'sampling/importance_sampling_ratio/mean': 1.0000141859054565, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9016259610652924, 'clip_ratio/low_mean': 4.2418692146384274e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7854651989400736e-06, 'clip_ratio/high_max': 1.1141860795760294e-05, 'clip_ratio/region_mean': 4.5204157913758536e-05, 'epoch': 0.19} + + 21%|██ | 210/1024 [9:25:17<35:04:16, 155.11s/it]INFO 12-01 22:50:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:50:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:50:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:50:17 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 211/1024 [9:27:55<35:12:43, 155.92s/it] + {'loss': 0.1135, 'grad_norm': 0.004418120253831148, 'learning_rate': 
1e-05, 'num_tokens': 178603454.0, 'completions/mean_length': 6993.671875, 'completions/min_length': 889.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6768.30419921875, 'completions/min_terminated_length': 889.0, 'completions/max_terminated_length': 15696.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01957814022898674, 'sampling/sampling_logp_difference/max': 6.312445640563965, 'sampling/importance_sampling_ratio/min': 0.0018135923892259598, 'sampling/importance_sampling_ratio/mean': 1.000037670135498, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9074988812208176, 'clip_ratio/low_mean': 4.609663824339805e-05, 'clip_ratio/low_min': 3.983555870945565e-06, 'clip_ratio/high_mean': 2.1587275114143267e-06, 'clip_ratio/high_max': 5.5243735914700665e-06, 'clip_ratio/region_mean': 4.8255366664307076e-05, 'epoch': 0.19} + + 21%|██ | 211/1024 [9:27:55<35:12:43, 155.92s/it]INFO 12-01 22:52:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:52:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:52:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:52:55 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 212/1024 [9:30:58<37:00:15, 164.06s/it] + {'loss': 0.0172, 'grad_norm': 0.00237120408564806, 'learning_rate': 1e-05, 'num_tokens': 179577063.0, 'completions/mean_length': 7445.1328125, 'completions/min_length': 24.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6849.20849609375, 'completions/min_terminated_length': 24.0, 'completions/max_terminated_length': 15316.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 
'reward_std': 0.21040897071361542, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02165937051177025, 'sampling/sampling_logp_difference/max': 9.245802879333496, 'sampling/importance_sampling_ratio/min': 9.651589061832055e-05, 'sampling/importance_sampling_ratio/mean': 0.9999725818634033, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9255013465881348, 'clip_ratio/low_mean': 2.7488794444252562e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2817357628591708e-06, 'clip_ratio/high_max': 5.126943051436683e-06, 'clip_ratio/region_mean': 2.877053032079857e-05, 'epoch': 0.2} + + 21%|██ | 212/1024 [9:30:58<37:00:15, 164.06s/it]INFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:55:57 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 213/1024 [9:33:50<37:32:46, 166.67s/it] + {'loss': 0.1291, 'grad_norm': 0.004715202376246452, 'learning_rate': 1e-05, 'num_tokens': 180380422.0, 'completions/mean_length': 6120.5546875, 'completions/min_length': 471.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5703.34130859375, 'completions/min_terminated_length': 471.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.29355230927467346, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018377620726823807, 'sampling/sampling_logp_difference/max': 5.437493324279785, 'sampling/importance_sampling_ratio/min': 0.004350374918431044, 'sampling/importance_sampling_ratio/mean': 0.999874472618103, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8181199952960014, 'clip_ratio/low_mean': 2.6486316301088664e-05, 'clip_ratio/low_min': 
3.516273409331916e-06, 'clip_ratio/high_mean': 4.7390736881425255e-06, 'clip_ratio/high_max': 1.8956294752570102e-05, 'clip_ratio/region_mean': 3.122539010291803e-05, 'epoch': 0.2} + + 21%|██ | 213/1024 [9:33:50<37:32:46, 166.67s/it]INFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 22:58:50 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 214/1024 [9:36:30<37:01:03, 164.52s/it] + {'loss': 0.0311, 'grad_norm': 0.003063712501898408, 'learning_rate': 1e-05, 'num_tokens': 181212776.0, 'completions/mean_length': 6351.203125, 'completions/min_length': 694.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5857.78662109375, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 16005.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3048579692840576, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019523698836565018, 'sampling/sampling_logp_difference/max': 11.74971866607666, 'sampling/importance_sampling_ratio/min': 7.891544555604924e-06, 'sampling/importance_sampling_ratio/mean': 0.9999946355819702, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8798654451966286, 'clip_ratio/low_mean': 3.4097628713425365e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.376495558564784e-06, 'clip_ratio/high_max': 5.594843969447538e-06, 'clip_ratio/region_mean': 3.6474124044616474e-05, 'epoch': 0.2} + + 21%|██ | 214/1024 [9:36:30<37:01:03, 164.52s/it]INFO 12-01 23:01:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:01:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:01:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 
23:01:30 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 215/1024 [9:39:02<36:08:01, 160.79s/it] + {'loss': 0.0924, 'grad_norm': 0.0033194730058312416, 'learning_rate': 1e-05, 'num_tokens': 182041910.0, 'completions/mean_length': 6330.046875, 'completions/min_length': 701.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6170.46044921875, 'completions/min_terminated_length': 701.0, 'completions/max_terminated_length': 14180.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018981872126460075, 'sampling/sampling_logp_difference/max': 9.158197402954102, 'sampling/importance_sampling_ratio/min': 0.00010535263572819531, 'sampling/importance_sampling_ratio/mean': 0.9998994469642639, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8319354206323624, 'clip_ratio/low_mean': 3.544438988001275e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.544438988001275e-05, 'epoch': 0.2} + + 21%|██ | 215/1024 [9:39:02<36:08:01, 160.79s/it]INFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:04:02 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 216/1024 [9:42:15<38:16:55, 170.56s/it] + {'loss': 0.0288, 'grad_norm': 0.004492956213653088, 'learning_rate': 1e-05, 'num_tokens': 182914843.0, 'completions/mean_length': 6665.2890625, 'completions/min_length': 722.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6351.7822265625, 'completions/min_terminated_length': 722.0, 'completions/max_terminated_length': 
15982.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.14807432889938354, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02088768407702446, 'sampling/sampling_logp_difference/max': 4.474179744720459, 'sampling/importance_sampling_ratio/min': 0.011399568989872932, 'sampling/importance_sampling_ratio/mean': 1.000030279159546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9336326420307159, 'clip_ratio/low_mean': 1.7156292415165808e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.7156292415165808e-05, 'epoch': 0.2} + + 21%|██ | 216/1024 [9:42:15<38:16:55, 170.56s/it]INFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:07:15 [block_pool.py:292] Successfully reset prefix cache + + 21%|██ | 217/1024 [9:44:54<37:24:16, 166.86s/it] + {'loss': -0.004, 'grad_norm': 0.003816079581156373, 'learning_rate': 1e-05, 'num_tokens': 183628152.0, 'completions/mean_length': 5393.9140625, 'completions/min_length': 628.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5039.39501953125, 'completions/min_terminated_length': 628.0, 'completions/max_terminated_length': 16064.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.31694266200065613, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018448319286108017, 'sampling/sampling_logp_difference/max': 5.730112552642822, 'sampling/importance_sampling_ratio/min': 0.003246711567044258, 'sampling/importance_sampling_ratio/mean': 0.9998779892921448, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 
0.7864786610007286, 'clip_ratio/low_mean': 5.4809036328151706e-05, 'clip_ratio/low_min': 8.953898031904828e-06, 'clip_ratio/high_mean': 9.084843100026774e-06, 'clip_ratio/high_max': 3.2495465802639956e-05, 'clip_ratio/region_mean': 6.389387954186532e-05, 'epoch': 0.2} + + 21%|██ | 217/1024 [9:44:54<37:24:16, 166.86s/it]INFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:09:53 [block_pool.py:292] Successfully reset prefix cache + + 21%|██▏ | 218/1024 [9:47:38<37:10:50, 166.07s/it] + {'loss': 0.0617, 'grad_norm': 0.003666195785626769, 'learning_rate': 1e-05, 'num_tokens': 184562352.0, 'completions/mean_length': 7161.5, 'completions/min_length': 681.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7015.111328125, 'completions/min_terminated_length': 681.0, 'completions/max_terminated_length': 15453.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019755780696868896, 'sampling/sampling_logp_difference/max': 8.272256851196289, 'sampling/importance_sampling_ratio/min': 0.00025550799909979105, 'sampling/importance_sampling_ratio/mean': 0.9999294281005859, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.915394201874733, 'clip_ratio/low_mean': 1.6896704778446292e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1596620172203984e-06, 'clip_ratio/high_max': 8.638648068881594e-06, 'clip_ratio/region_mean': 1.9056366909353528e-05, 'epoch': 0.2} + + 21%|██▏ | 218/1024 [9:47:38<37:10:50, 166.07s/it]INFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:12:38 [block_pool.py:292] Successfully reset prefix cache + + 21%|██▏ | 219/1024 [9:50:36<37:57:19, 169.74s/it] + {'loss': 0.032, 'grad_norm': 0.0025940234772861004, 'learning_rate': 1e-05, 'num_tokens': 185606670.0, 'completions/mean_length': 7957.671875, 'completions/min_length': 96.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7685.8544921875, 'completions/min_terminated_length': 96.0, 'completions/max_terminated_length': 15408.0, 'rewards/accuracy_reward/mean': 0.1171875, 'rewards/accuracy_reward/std': 0.322907418012619, 'reward': 0.1171875, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02338646724820137, 'sampling/sampling_logp_difference/max': 7.179195404052734, 'sampling/importance_sampling_ratio/min': 0.0007622809498570859, 'sampling/importance_sampling_ratio/mean': 0.999893844127655, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1176252663135529, 'clip_ratio/low_mean': 2.49038239417132e-05, 'clip_ratio/low_min': 4.00025601265952e-06, 'clip_ratio/high_mean': 1.6062328995758435e-06, 'clip_ratio/high_max': 6.424931598303374e-06, 'clip_ratio/region_mean': 2.651005689813246e-05, 'epoch': 0.2} + + 21%|██▏ | 219/1024 [9:50:36<37:57:19, 169.74s/it]INFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:15:36 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-01 23:17:24,381 - math_verify.grader - WARNING - Timeout during comparison + + 21%|██▏ | 220/1024 [9:53:24<37:47:36, 169.22s/it] + {'loss': 0.0607, 'grad_norm': 0.004315398633480072, 'learning_rate': 1e-05, 'num_tokens': 186526883.0, 'completions/mean_length': 7060.6640625, 
'completions/min_length': 1460.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6759.9111328125, 'completions/min_terminated_length': 1460.0, 'completions/max_terminated_length': 16146.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01967843994498253, 'sampling/sampling_logp_difference/max': 7.687473297119141, 'sampling/importance_sampling_ratio/min': 0.0004585353017318994, 'sampling/importance_sampling_ratio/mean': 1.000004529953003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9148540124297142, 'clip_ratio/low_mean': 4.4742550926457625e-05, 'clip_ratio/low_min': 3.5803282116830815e-06, 'clip_ratio/high_mean': 5.829163114867697e-06, 'clip_ratio/high_max': 1.9903963220713194e-05, 'clip_ratio/region_mean': 5.057171370026481e-05, 'epoch': 0.2} + + 21%|██▏ | 220/1024 [9:53:24<37:47:36, 169.22s/it]INFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:18:24 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 221/1024 [9:56:03<37:00:54, 165.95s/it] + {'loss': 0.0606, 'grad_norm': 0.0030786178540438414, 'learning_rate': 1e-05, 'num_tokens': 187397536.0, 'completions/mean_length': 6649.6640625, 'completions/min_length': 780.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6416.04052734375, 'completions/min_terminated_length': 780.0, 'completions/max_terminated_length': 15596.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 
'sampling/sampling_logp_difference/mean': 0.020215414464473724, 'sampling/sampling_logp_difference/max': 14.929608345031738, 'sampling/importance_sampling_ratio/min': 3.2821125728332845e-07, 'sampling/importance_sampling_ratio/mean': 1.000005841255188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9298559054732323, 'clip_ratio/low_mean': 2.8967988555450574e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8677483214778476e-06, 'clip_ratio/high_max': 1.147099328591139e-05, 'clip_ratio/region_mean': 3.1835736763241584e-05, 'epoch': 0.2} + + 22%|██▏ | 221/1024 [9:56:03<37:00:54, 165.95s/it]INFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:21:02 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 222/1024 [9:59:07<38:13:40, 171.60s/it] + {'loss': 0.044, 'grad_norm': 0.002438523108139634, 'learning_rate': 1e-05, 'num_tokens': 188477778.0, 'completions/mean_length': 8292.015625, 'completions/min_length': 533.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7823.8837890625, 'completions/min_terminated_length': 533.0, 'completions/max_terminated_length': 16210.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018984414637088776, 'sampling/sampling_logp_difference/max': 5.178531169891357, 'sampling/importance_sampling_ratio/min': 0.005636279005557299, 'sampling/importance_sampling_ratio/mean': 1.0000240802764893, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8232023045420647, 'clip_ratio/low_mean': 3.249637484259438e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.692142735824746e-06, 
'clip_ratio/high_max': 2.2768570943298982e-05, 'clip_ratio/region_mean': 3.8188517464732286e-05, 'epoch': 0.2} + + 22%|██▏ | 222/1024 [9:59:07<38:13:40, 171.60s/it]INFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:24:07 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 223/1024 [10:02:04<38:29:49, 173.02s/it] + {'loss': 0.0486, 'grad_norm': 0.004773247055709362, 'learning_rate': 1e-05, 'num_tokens': 189470655.0, 'completions/mean_length': 7600.9765625, 'completions/min_length': 995.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6936.71484375, 'completions/min_terminated_length': 995.0, 'completions/max_terminated_length': 15991.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3079911172389984, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018666012212634087, 'sampling/sampling_logp_difference/max': 6.624707221984863, 'sampling/importance_sampling_ratio/min': 0.001327168894931674, 'sampling/importance_sampling_ratio/mean': 0.9999308586120605, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8689917623996735, 'clip_ratio/low_mean': 2.255633432923787e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.135253556749376e-06, 'clip_ratio/high_max': 2.0840709566982696e-05, 'clip_ratio/region_mean': 2.869158777230041e-05, 'epoch': 0.21} + + 22%|██▏ | 223/1024 [10:02:04<38:29:49, 173.02s/it]INFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:27:03 [block_pool.py:292] Successfully reset prefix cache + + 
22%|██▏ | 224/1024 [10:04:52<38:09:18, 171.70s/it] + {'loss': 0.145, 'grad_norm': 0.004298723768442869, 'learning_rate': 1e-05, 'num_tokens': 190462227.0, 'completions/mean_length': 7600.34375, 'completions/min_length': 1335.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6855.96630859375, 'completions/min_terminated_length': 1335.0, 'completions/max_terminated_length': 16215.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2919674217700958, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018035393208265305, 'sampling/sampling_logp_difference/max': 9.996363639831543, 'sampling/importance_sampling_ratio/min': 4.5565320760942996e-05, 'sampling/importance_sampling_ratio/mean': 0.9999310374259949, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7636929750442505, 'clip_ratio/low_mean': 6.463955219260242e-05, 'clip_ratio/low_min': 1.0895145351241808e-05, 'clip_ratio/high_mean': 2.459364736751013e-06, 'clip_ratio/high_max': 9.837458947004052e-06, 'clip_ratio/region_mean': 6.70989177251613e-05, 'epoch': 0.21} + + 22%|██▏ | 224/1024 [10:04:52<38:09:18, 171.70s/it]INFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:29:52 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 225/1024 [10:07:43<38:01:54, 171.36s/it] + {'loss': 0.0859, 'grad_norm': 0.006741553544998169, 'learning_rate': 1e-05, 'num_tokens': 191312483.0, 'completions/mean_length': 6512.0, 'completions/min_length': 574.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6434.267578125, 'completions/min_terminated_length': 574.0, 'completions/max_terminated_length': 15151.0, 
'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020878732204437256, 'sampling/sampling_logp_difference/max': 10.937172889709473, 'sampling/importance_sampling_ratio/min': 1.778468504198827e-05, 'sampling/importance_sampling_ratio/mean': 1.0000028610229492, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9043584689497948, 'clip_ratio/low_mean': 2.6516039497437305e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5151505812791584e-06, 'clip_ratio/high_max': 1.4060602325116633e-05, 'clip_ratio/region_mean': 3.003119024924672e-05, 'epoch': 0.21} + + 22%|██▏ | 225/1024 [10:07:43<38:01:54, 171.36s/it]INFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:32:42 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 226/1024 [10:10:32<37:49:32, 170.64s/it] + {'loss': 0.0515, 'grad_norm': 0.00281486171297729, 'learning_rate': 1e-05, 'num_tokens': 192251235.0, 'completions/mean_length': 7178.6875, 'completions/min_length': 847.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6565.00048828125, 'completions/min_terminated_length': 847.0, 'completions/max_terminated_length': 16339.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2240736484527588, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020196784287691116, 'sampling/sampling_logp_difference/max': 9.314308166503906, 'sampling/importance_sampling_ratio/min': 9.012543159769848e-05, 'sampling/importance_sampling_ratio/mean': 0.9999714493751526, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8899475410580635, 'clip_ratio/low_mean': 2.8831826739406097e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.339021302257606e-06, 'clip_ratio/high_max': 1.7356085209030425e-05, 'clip_ratio/region_mean': 3.317084781429003e-05, 'epoch': 0.21} + + 22%|██▏ | 226/1024 [10:10:32<37:49:32, 170.64s/it]INFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:35:31 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 227/1024 [10:13:05<36:36:25, 165.35s/it] + {'loss': 0.0781, 'grad_norm': 0.005070593673735857, 'learning_rate': 1e-05, 'num_tokens': 193116763.0, 'completions/mean_length': 6602.5625, 'completions/min_length': 927.0, 'completions/max_length': 15501.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6602.5625, 'completions/min_terminated_length': 927.0, 'completions/max_terminated_length': 15501.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020026464015245438, 'sampling/sampling_logp_difference/max': 12.812478065490723, 'sampling/importance_sampling_ratio/min': 2.726537559283315e-06, 'sampling/importance_sampling_ratio/mean': 0.9999746680259705, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9266818463802338, 'clip_ratio/low_mean': 3.0248688972278615e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.889521053679346e-06, 'clip_ratio/high_max': 1.5558084214717383e-05, 'clip_ratio/region_mean': 3.413820991227112e-05, 'epoch': 0.21} + + 22%|██▏ | 227/1024 [10:13:05<36:36:25, 165.35s/it]INFO 12-01 23:38:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:38:04 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-01 23:38:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:38:04 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 228/1024 [10:15:43<36:06:16, 163.29s/it] + {'loss': 0.0637, 'grad_norm': 0.006362155079841614, 'learning_rate': 1e-05, 'num_tokens': 194007868.0, 'completions/mean_length': 6818.8828125, 'completions/min_length': 510.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6430.056640625, 'completions/min_terminated_length': 510.0, 'completions/max_terminated_length': 16046.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01943325623869896, 'sampling/sampling_logp_difference/max': 7.55847692489624, 'sampling/importance_sampling_ratio/min': 0.0005216691642999649, 'sampling/importance_sampling_ratio/mean': 1.000009298324585, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.874519519507885, 'clip_ratio/low_mean': 2.959152834591805e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.047181854119117e-06, 'clip_ratio/high_max': 4.188727416476468e-06, 'clip_ratio/region_mean': 3.063871008635033e-05, 'epoch': 0.21} + + 22%|██▏ | 228/1024 [10:15:43<36:06:16, 163.29s/it]INFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:40:43 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 229/1024 [10:18:21<35:41:55, 161.65s/it] + {'loss': 0.1061, 'grad_norm': 0.003797185141593218, 'learning_rate': 1e-05, 'num_tokens': 194735980.0, 'completions/mean_length': 5515.625, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 
'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5343.111328125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 14536.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.34010058641433716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02120930328965187, 'sampling/sampling_logp_difference/max': 15.989612579345703, 'sampling/importance_sampling_ratio/min': 1.137102216830499e-07, 'sampling/importance_sampling_ratio/mean': 0.999911367893219, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0683523043990135, 'clip_ratio/low_mean': 6.821557258263056e-05, 'clip_ratio/low_min': 1.7265090718865395e-05, 'clip_ratio/high_mean': 2.4114777943395893e-06, 'clip_ratio/high_max': 9.645911177358357e-06, 'clip_ratio/region_mean': 7.062705049065698e-05, 'epoch': 0.21} + + 22%|██▏ | 229/1024 [10:18:21<35:41:55, 161.65s/it]INFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:43:21 [block_pool.py:292] Successfully reset prefix cache + + 22%|██▏ | 230/1024 [10:20:44<34:23:17, 155.92s/it] + {'loss': 0.0204, 'grad_norm': 0.004124365746974945, 'learning_rate': 1e-05, 'num_tokens': 195504882.0, 'completions/mean_length': 5853.546875, 'completions/min_length': 615.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5770.6298828125, 'completions/min_terminated_length': 615.0, 'completions/max_terminated_length': 14992.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3243142366409302, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017819223925471306, 
'sampling/sampling_logp_difference/max': 5.717539310455322, 'sampling/importance_sampling_ratio/min': 0.0032877910416573286, 'sampling/importance_sampling_ratio/mean': 1.0000672340393066, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7975900694727898, 'clip_ratio/low_mean': 4.9151800567415194e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.4928530630604655e-06, 'clip_ratio/high_max': 2.1971412252241862e-05, 'clip_ratio/region_mean': 5.4644653801005916e-05, 'epoch': 0.21} + + 22%|██▏ | 230/1024 [10:20:44<34:23:17, 155.92s/it]INFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:45:43 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 231/1024 [10:23:34<35:17:47, 160.24s/it] + {'loss': 0.0947, 'grad_norm': 0.0024995009880512953, 'learning_rate': 1e-05, 'num_tokens': 196379306.0, 'completions/mean_length': 6686.25, 'completions/min_length': 260.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6532.31787109375, 'completions/min_terminated_length': 260.0, 'completions/max_terminated_length': 15503.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.35824593901634216, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018989525735378265, 'sampling/sampling_logp_difference/max': 10.818918228149414, 'sampling/importance_sampling_ratio/min': 2.0017207134515047e-05, 'sampling/importance_sampling_ratio/mean': 0.9999300837516785, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9018580466508865, 'clip_ratio/low_mean': 5.1467116236381116e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.843255515472265e-06, 'clip_ratio/high_max': 7.066538728395244e-06, 'clip_ratio/region_mean': 
5.431037175185338e-05, 'epoch': 0.21} + + 23%|██▎ | 231/1024 [10:23:34<35:17:47, 160.24s/it]INFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:48:34 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 232/1024 [10:26:50<37:37:43, 171.04s/it] + {'loss': 0.0754, 'grad_norm': 0.004295211285352707, 'learning_rate': 1e-05, 'num_tokens': 197357397.0, 'completions/mean_length': 7487.3359375, 'completions/min_length': 1222.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7200.3466796875, 'completions/min_terminated_length': 1222.0, 'completions/max_terminated_length': 16347.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02209121733903885, 'sampling/sampling_logp_difference/max': 7.33111047744751, 'sampling/importance_sampling_ratio/min': 0.0006548459641635418, 'sampling/importance_sampling_ratio/mean': 1.000002384185791, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9890001565217972, 'clip_ratio/low_mean': 3.699686294567073e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5722979444253724e-06, 'clip_ratio/high_max': 6.652828687947476e-06, 'clip_ratio/region_mean': 3.95691608900961e-05, 'epoch': 0.21} + + 23%|██▎ | 232/1024 [10:26:50<37:37:43, 171.04s/it]INFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:51:50 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 233/1024 [10:29:40<37:29:19, 170.62s/it] + {'loss': 0.0914, 
'grad_norm': 0.003119673579931259, 'learning_rate': 1e-05, 'num_tokens': 198303795.0, 'completions/mean_length': 7233.484375, 'completions/min_length': 706.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6938.30615234375, 'completions/min_terminated_length': 706.0, 'completions/max_terminated_length': 15825.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.23014704883098602, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021085180342197418, 'sampling/sampling_logp_difference/max': 3.89424467086792, 'sampling/importance_sampling_ratio/min': 0.020358745008707047, 'sampling/importance_sampling_ratio/mean': 1.0000243186950684, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9683803990483284, 'clip_ratio/low_mean': 2.9443070673096372e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5212734751912649e-06, 'clip_ratio/high_max': 6.0850939007650595e-06, 'clip_ratio/region_mean': 3.0964344205131056e-05, 'epoch': 0.21} + + 23%|██▎ | 233/1024 [10:29:40<37:29:19, 170.62s/it]INFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:54:39 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 234/1024 [10:32:10<36:06:22, 164.54s/it] + {'loss': 0.1028, 'grad_norm': 0.0033790848683565855, 'learning_rate': 1e-05, 'num_tokens': 199154735.0, 'completions/mean_length': 6457.78125, 'completions/min_length': 850.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6300.22265625, 'completions/min_terminated_length': 850.0, 'completions/max_terminated_length': 15733.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 
0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01950821653008461, 'sampling/sampling_logp_difference/max': 15.063070297241211, 'sampling/importance_sampling_ratio/min': 2.872048128210736e-07, 'sampling/importance_sampling_ratio/mean': 0.9998799562454224, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8881053999066353, 'clip_ratio/low_mean': 4.031422963635123e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9909530237782747e-06, 'clip_ratio/high_max': 7.963812095113099e-06, 'clip_ratio/region_mean': 4.23051826601295e-05, 'epoch': 0.22} + + 23%|██▎ | 234/1024 [10:32:10<36:06:22, 164.54s/it]INFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-01 23:57:10 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 235/1024 [10:35:04<36:38:48, 167.21s/it] + {'loss': 0.0179, 'grad_norm': 0.0021492803934961557, 'learning_rate': 1e-05, 'num_tokens': 200185643.0, 'completions/mean_length': 7904.40625, 'completions/min_length': 1128.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7769.81005859375, 'completions/min_terminated_length': 1128.0, 'completions/max_terminated_length': 16318.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.1820138692855835, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021201875060796738, 'sampling/sampling_logp_difference/max': 6.530262470245361, 'sampling/importance_sampling_ratio/min': 0.001458622980862856, 'sampling/importance_sampling_ratio/mean': 1.0001094341278076, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9881557524204254, 'clip_ratio/low_mean': 
2.2856192117615137e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3430123380639998e-06, 'clip_ratio/high_max': 9.059622016138746e-06, 'clip_ratio/region_mean': 2.6199204512522556e-05, 'epoch': 0.22} + + 23%|██▎ | 235/1024 [10:35:04<36:38:48, 167.21s/it]INFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:00:03 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 236/1024 [10:37:50<36:34:08, 167.07s/it] + {'loss': 0.0327, 'grad_norm': 0.0037221095990389585, 'learning_rate': 1e-05, 'num_tokens': 201153114.0, 'completions/mean_length': 7414.4921875, 'completions/min_length': 949.0, 'completions/max_length': 15328.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7414.4921875, 'completions/min_terminated_length': 949.0, 'completions/max_terminated_length': 15328.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.248829185962677, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021356744691729546, 'sampling/sampling_logp_difference/max': 6.99871301651001, 'sampling/importance_sampling_ratio/min': 0.0009130563121289015, 'sampling/importance_sampling_ratio/mean': 0.9999958872795105, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9571134969592094, 'clip_ratio/low_mean': 3.018811844412994e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7490709751655231e-06, 'clip_ratio/high_max': 6.9962839006620925e-06, 'clip_ratio/region_mean': 3.193718976035598e-05, 'epoch': 0.22} + + 23%|██▎ | 236/1024 [10:37:50<36:34:08, 167.07s/it]INFO 12-02 00:02:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:02:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:02:50 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-02 00:02:50 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 237/1024 [10:40:13<34:55:30, 159.76s/it] + {'loss': 0.0641, 'grad_norm': 0.006285305600613356, 'learning_rate': 1e-05, 'num_tokens': 201933044.0, 'completions/mean_length': 5955.953125, 'completions/min_length': 1394.0, 'completions/max_length': 15835.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5955.953125, 'completions/min_terminated_length': 1394.0, 'completions/max_terminated_length': 15835.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.31011277437210083, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016975615173578262, 'sampling/sampling_logp_difference/max': 4.888189792633057, 'sampling/importance_sampling_ratio/min': 0.007535050623118877, 'sampling/importance_sampling_ratio/mean': 0.9999420642852783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.730999618768692, 'clip_ratio/low_mean': 5.4354991334548686e-05, 'clip_ratio/low_min': 6.868132004456129e-06, 'clip_ratio/high_mean': 2.8120230126660317e-06, 'clip_ratio/high_max': 1.1248092050664127e-05, 'clip_ratio/region_mean': 5.716701480196207e-05, 'epoch': 0.22} + + 23%|██▎ | 237/1024 [10:40:13<34:55:30, 159.76s/it]INFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:05:13 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 238/1024 [10:42:52<34:50:50, 159.61s/it] + {'loss': 0.0793, 'grad_norm': 0.005667983554303646, 'learning_rate': 1e-05, 'num_tokens': 202837281.0, 'completions/mean_length': 6923.3515625, 'completions/min_length': 63.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6458.0732421875, 
'completions/min_terminated_length': 63.0, 'completions/max_terminated_length': 15959.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.26826781034469604, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.022059854120016098, 'sampling/sampling_logp_difference/max': 10.402952194213867, 'sampling/importance_sampling_ratio/min': 3.0342773243319243e-05, 'sampling/importance_sampling_ratio/mean': 0.999980092048645, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9938417226076126, 'clip_ratio/low_mean': 4.66828214484849e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.094216481258627e-06, 'clip_ratio/high_max': 7.226686648209579e-06, 'clip_ratio/region_mean': 4.977703792974353e-05, 'epoch': 0.22} + + 23%|██▎ | 238/1024 [10:42:52<34:50:50, 159.61s/it]INFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:07:52 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 239/1024 [10:45:18<33:52:37, 155.36s/it] + {'loss': 0.0299, 'grad_norm': 0.004052883945405483, 'learning_rate': 1e-05, 'num_tokens': 203614448.0, 'completions/mean_length': 5930.9296875, 'completions/min_length': 343.0, 'completions/max_length': 14726.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5930.9296875, 'completions/min_terminated_length': 343.0, 'completions/max_terminated_length': 14726.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018519222736358643, 'sampling/sampling_logp_difference/max': 8.79355239868164, 'sampling/importance_sampling_ratio/min': 0.00015170808183029294, 
'sampling/importance_sampling_ratio/mean': 0.999989926815033, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8100385963916779, 'clip_ratio/low_mean': 4.239228087499214e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3079692280371091e-06, 'clip_ratio/high_max': 5.2318769121484365e-06, 'clip_ratio/region_mean': 4.3700250216716086e-05, 'epoch': 0.22} + + 23%|██▎ | 239/1024 [10:45:18<33:52:37, 155.36s/it]INFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:10:17 [block_pool.py:292] Successfully reset prefix cache + + 23%|██▎ | 240/1024 [10:48:38<36:46:06, 168.84s/it] + {'loss': 0.0269, 'grad_norm': 0.004494607914239168, 'learning_rate': 1e-05, 'num_tokens': 204518261.0, 'completions/mean_length': 6911.1015625, 'completions/min_length': 862.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6108.3134765625, 'completions/min_terminated_length': 862.0, 'completions/max_terminated_length': 14996.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.34033796191215515, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020527629181742668, 'sampling/sampling_logp_difference/max': 6.484711647033691, 'sampling/importance_sampling_ratio/min': 0.0015266009140759706, 'sampling/importance_sampling_ratio/mean': 0.9998886585235596, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9260227829217911, 'clip_ratio/low_mean': 5.500513248080097e-05, 'clip_ratio/low_min': 7.924934834591113e-06, 'clip_ratio/high_mean': 1.226307745127997e-06, 'clip_ratio/high_max': 4.905230980511988e-06, 'clip_ratio/region_mean': 5.6231440112242126e-05, 'epoch': 0.22} + + 23%|██▎ | 240/1024 [10:48:38<36:46:06, 168.84s/it]INFO 12-02 
00:13:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:13:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:13:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:13:38 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▎ | 241/1024 [10:51:19<36:10:38, 166.33s/it] + {'loss': 0.0108, 'grad_norm': 0.0029451537411659956, 'learning_rate': 1e-05, 'num_tokens': 205433843.0, 'completions/mean_length': 6972.921875, 'completions/min_length': 438.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6823.5400390625, 'completions/min_terminated_length': 438.0, 'completions/max_terminated_length': 14637.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02013089321553707, 'sampling/sampling_logp_difference/max': 10.53177547454834, 'sampling/importance_sampling_ratio/min': 2.6675223125494085e-05, 'sampling/importance_sampling_ratio/mean': 1.0000104904174805, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0095533654093742, 'clip_ratio/low_mean': 4.75325257411896e-05, 'clip_ratio/low_min': 3.599504680096288e-06, 'clip_ratio/high_mean': 2.073441009997623e-06, 'clip_ratio/high_max': 8.293764039990492e-06, 'clip_ratio/region_mean': 4.960596663750039e-05, 'epoch': 0.22} + + 24%|██▎ | 241/1024 [10:51:19<36:10:38, 166.33s/it]INFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:16:18 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▎ | 242/1024 [10:54:04<36:03:14, 165.98s/it] + {'loss': 0.073, 'grad_norm': 0.003371767932549119, 'learning_rate': 1e-05, 'num_tokens': 206310296.0, 
'completions/mean_length': 6706.6640625, 'completions/min_length': 892.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6313.2763671875, 'completions/min_terminated_length': 892.0, 'completions/max_terminated_length': 16103.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.3537652790546417, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019770190119743347, 'sampling/sampling_logp_difference/max': 10.431736946105957, 'sampling/importance_sampling_ratio/min': 2.948181463580113e-05, 'sampling/importance_sampling_ratio/mean': 0.9999367594718933, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8647518903017044, 'clip_ratio/low_mean': 3.86000854177837e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.382379150527413e-05, 'clip_ratio/high_max': 4.163383164268453e-05, 'clip_ratio/region_mean': 5.2423876240936806e-05, 'epoch': 0.22} + + 24%|██▎ | 242/1024 [10:54:04<36:03:14, 165.98s/it]INFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:19:03 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▎ | 243/1024 [10:56:56<36:26:03, 167.94s/it] + {'loss': 0.0281, 'grad_norm': 0.0016336971893906593, 'learning_rate': 1e-05, 'num_tokens': 207210974.0, 'completions/mean_length': 6882.609375, 'completions/min_length': 1119.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6415.32763671875, 'completions/min_terminated_length': 1119.0, 'completions/max_terminated_length': 16136.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.15650184452533722, 
'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.02139991894364357, 'sampling/sampling_logp_difference/max': 6.624994277954102, 'sampling/importance_sampling_ratio/min': 0.0013267879839986563, 'sampling/importance_sampling_ratio/mean': 0.9999210834503174, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.013342760503292, 'clip_ratio/low_mean': 2.4946740381892596e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.4946740381892596e-05, 'epoch': 0.22} + + 24%|██▎ | 243/1024 [10:56:56<36:26:03, 167.94s/it]INFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:21:56 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▍ | 244/1024 [10:59:10<34:10:30, 157.73s/it] + {'loss': 0.0542, 'grad_norm': 0.005036406684666872, 'learning_rate': 1e-05, 'num_tokens': 208021893.0, 'completions/mean_length': 6195.7421875, 'completions/min_length': 409.0, 'completions/max_length': 15203.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6195.7421875, 'completions/min_terminated_length': 409.0, 'completions/max_terminated_length': 15203.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3453505039215088, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018679853528738022, 'sampling/sampling_logp_difference/max': 5.512784957885742, 'sampling/importance_sampling_ratio/min': 0.0040348549373447895, 'sampling/importance_sampling_ratio/mean': 0.9999955892562866, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8448907434940338, 'clip_ratio/low_mean': 3.938925010515959e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7166009860811755e-06, 'clip_ratio/high_max': 
1.4866403944324702e-05, 'clip_ratio/region_mean': 4.310585177336179e-05, 'epoch': 0.22} + + 24%|██▍ | 244/1024 [10:59:10<34:10:30, 157.73s/it]INFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:24:10 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▍ | 245/1024 [11:01:51<34:20:13, 158.68s/it] + {'loss': 0.0907, 'grad_norm': 0.0029643685556948185, 'learning_rate': 1e-05, 'num_tokens': 208912059.0, 'completions/mean_length': 6829.609375, 'completions/min_length': 735.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6521.40283203125, 'completions/min_terminated_length': 735.0, 'completions/max_terminated_length': 16305.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3079911172389984, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018488366156816483, 'sampling/sampling_logp_difference/max': 7.873661994934082, 'sampling/importance_sampling_ratio/min': 0.00038063788088038564, 'sampling/importance_sampling_ratio/mean': 0.9999761581420898, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8679579794406891, 'clip_ratio/low_mean': 3.422392001084518e-05, 'clip_ratio/low_min': 6.451612989621935e-06, 'clip_ratio/high_mean': 2.811220838339068e-06, 'clip_ratio/high_max': 1.1244883353356272e-05, 'clip_ratio/region_mean': 3.703514119024476e-05, 'epoch': 0.23} + + 24%|██▍ | 245/1024 [11:01:51<34:20:13, 158.68s/it]INFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:26:51 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▍ 
| 246/1024 [11:04:10<33:00:12, 152.71s/it] + {'loss': 0.0932, 'grad_norm': 0.0035942886024713516, 'learning_rate': 1e-05, 'num_tokens': 209627804.0, 'completions/mean_length': 5444.4453125, 'completions/min_length': 575.0, 'completions/max_length': 14503.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5444.4453125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 14503.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.338498055934906, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020146891474723816, 'sampling/sampling_logp_difference/max': 3.4484035968780518, 'sampling/importance_sampling_ratio/min': 0.03179635480046272, 'sampling/importance_sampling_ratio/mean': 0.99997478723526, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0460086688399315, 'clip_ratio/low_mean': 3.138338854569156e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.675150077877333e-06, 'clip_ratio/high_max': 2.2700600311509334e-05, 'clip_ratio/region_mean': 3.705853873725573e-05, 'epoch': 0.23} + + 24%|██▍ | 246/1024 [11:04:10<33:00:12, 152.71s/it]INFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:29:09 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▍ | 247/1024 [11:07:06<34:27:59, 159.69s/it] + {'loss': 0.0699, 'grad_norm': 0.0044983453117311, 'learning_rate': 1e-05, 'num_tokens': 210630150.0, 'completions/mean_length': 7657.390625, 'completions/min_length': 1048.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7152.544921875, 'completions/min_terminated_length': 1048.0, 'completions/max_terminated_length': 16244.0, 'rewards/accuracy_reward/mean': 
0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.26249876618385315, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02131088823080063, 'sampling/sampling_logp_difference/max': 10.158285140991211, 'sampling/importance_sampling_ratio/min': 3.8753667467972264e-05, 'sampling/importance_sampling_ratio/mean': 1.0000007152557373, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9528728649020195, 'clip_ratio/low_mean': 5.265122354103369e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.552578502625693e-06, 'clip_ratio/high_max': 1.477029400120955e-05, 'clip_ratio/region_mean': 5.720380158891203e-05, 'epoch': 0.23} + + 24%|██▍ | 247/1024 [11:07:06<34:27:59, 159.69s/it]INFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:32:05 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▍ | 248/1024 [11:09:55<35:01:14, 162.47s/it] + {'loss': 0.0566, 'grad_norm': 0.006095650140196085, 'learning_rate': 1e-05, 'num_tokens': 211620355.0, 'completions/mean_length': 7574.3515625, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7504.984375, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 16284.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021727774292230606, 'sampling/sampling_logp_difference/max': 6.575083255767822, 'sampling/importance_sampling_ratio/min': 0.0013946897815912962, 'sampling/importance_sampling_ratio/mean': 1.0000433921813965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 
1.0009776800870895, 'clip_ratio/low_mean': 2.2759413695894182e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.151910678094282e-06, 'clip_ratio/high_max': 8.607642712377128e-06, 'clip_ratio/region_mean': 2.491132454451872e-05, 'epoch': 0.23} + + 24%|██▍ | 248/1024 [11:09:55<35:01:14, 162.47s/it]INFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:34:54 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▍ | 249/1024 [11:12:51<35:51:41, 166.58s/it] + {'loss': 0.0364, 'grad_norm': 0.0037038614973425865, 'learning_rate': 1e-05, 'num_tokens': 212654747.0, 'completions/mean_length': 7919.6875, 'completions/min_length': 1517.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7716.54443359375, 'completions/min_terminated_length': 1517.0, 'completions/max_terminated_length': 14915.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.022051017731428146, 'sampling/sampling_logp_difference/max': 5.157684326171875, 'sampling/importance_sampling_ratio/min': 0.0057550109922885895, 'sampling/importance_sampling_ratio/mean': 0.9999381899833679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0405654236674309, 'clip_ratio/low_mean': 5.936152001595474e-05, 'clip_ratio/low_min': 9.155588486464694e-06, 'clip_ratio/high_mean': 5.141430960975413e-06, 'clip_ratio/high_max': 1.764823082339717e-05, 'clip_ratio/region_mean': 6.450295177273802e-05, 'epoch': 0.23} + + 24%|██▍ | 249/1024 [11:12:51<35:51:41, 166.58s/it]INFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:37:51 [block_pool.py:292] Successfully reset prefix cache + + 24%|██▍ | 250/1024 [11:16:22<38:42:46, 180.06s/it] + {'loss': 0.0571, 'grad_norm': 0.00325607368722558, 'learning_rate': 1e-05, 'num_tokens': 213774584.0, 'completions/mean_length': 8613.4765625, 'completions/min_length': 694.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7735.0693359375, 'completions/min_terminated_length': 694.0, 'completions/max_terminated_length': 16122.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.33668074011802673, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020002499222755432, 'sampling/sampling_logp_difference/max': 10.999996185302734, 'sampling/importance_sampling_ratio/min': 1.670176425250247e-05, 'sampling/importance_sampling_ratio/mean': 1.000060796737671, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.890489287674427, 'clip_ratio/low_mean': 4.716233138424286e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1631356023353874e-06, 'clip_ratio/high_max': 1.265254240934155e-05, 'clip_ratio/region_mean': 5.032546687289141e-05, 'epoch': 0.23} + + 24%|██▍ | 250/1024 [11:16:22<38:42:46, 180.06s/it]INFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:41:22 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▍ | 251/1024 [11:19:29<39:03:49, 181.93s/it] + {'loss': 0.0717, 'grad_norm': 0.0038265211042016745, 'learning_rate': 1e-05, 'num_tokens': 214728371.0, 'completions/mean_length': 7324.8984375, 'completions/min_length': 704.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 
'completions/mean_terminated_length': 6473.1884765625, 'completions/min_terminated_length': 704.0, 'completions/max_terminated_length': 16022.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.32719239592552185, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018367979675531387, 'sampling/sampling_logp_difference/max': 8.095518112182617, 'sampling/importance_sampling_ratio/min': 0.0003049026126973331, 'sampling/importance_sampling_ratio/mean': 1.0000168085098267, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.761004202067852, 'clip_ratio/low_mean': 3.880500707964529e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.101151375834888e-06, 'clip_ratio/high_max': 1.6404605503339553e-05, 'clip_ratio/region_mean': 4.2906158682853857e-05, 'epoch': 0.23} + + 25%|██▍ | 251/1024 [11:19:29<39:03:49, 181.93s/it]INFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:44:28 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▍ | 252/1024 [11:21:50<36:22:09, 169.60s/it] + {'loss': 0.0158, 'grad_norm': 0.002729539293795824, 'learning_rate': 1e-05, 'num_tokens': 215570806.0, 'completions/mean_length': 6422.0859375, 'completions/min_length': 373.0, 'completions/max_length': 14167.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6422.0859375, 'completions/min_terminated_length': 373.0, 'completions/max_terminated_length': 14167.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.25620076060295105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021903935819864273, 'sampling/sampling_logp_difference/max': 3.637866497039795, 
'sampling/importance_sampling_ratio/min': 0.026308411732316017, 'sampling/importance_sampling_ratio/mean': 0.9999935030937195, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9946094751358032, 'clip_ratio/low_mean': 3.6433707123251224e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4061374713492114e-06, 'clip_ratio/high_max': 5.624549885396846e-06, 'clip_ratio/region_mean': 3.7839844594600436e-05, 'epoch': 0.23} + + 25%|██▍ | 252/1024 [11:21:50<36:22:09, 169.60s/it]INFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:46:49 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▍ | 253/1024 [11:24:38<36:13:06, 169.11s/it] + {'loss': 0.1011, 'grad_norm': 0.004974282346665859, 'learning_rate': 1e-05, 'num_tokens': 216465635.0, 'completions/mean_length': 6845.2890625, 'completions/min_length': 1252.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6693.88134765625, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 15585.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.30061954259872437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019389234483242035, 'sampling/sampling_logp_difference/max': 9.343890190124512, 'sampling/importance_sampling_ratio/min': 8.749838889343664e-05, 'sampling/importance_sampling_ratio/mean': 1.0000090599060059, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8822609707713127, 'clip_ratio/low_mean': 3.17277934982485e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8094962115355884e-06, 'clip_ratio/high_max': 7.2379848461423535e-06, 'clip_ratio/region_mean': 3.353728982347093e-05, 'epoch': 0.23} + + 25%|██▍ | 
253/1024 [11:24:38<36:13:06, 169.11s/it]INFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:49:37 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▍ | 254/1024 [11:27:43<37:12:25, 173.96s/it] + {'loss': 0.0538, 'grad_norm': 0.0033159854356199503, 'learning_rate': 1e-05, 'num_tokens': 217485089.0, 'completions/mean_length': 7805.484375, 'completions/min_length': 435.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7528.7578125, 'completions/min_terminated_length': 435.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.33114904165267944, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021925684064626694, 'sampling/sampling_logp_difference/max': 9.437499046325684, 'sampling/importance_sampling_ratio/min': 7.967943383846432e-05, 'sampling/importance_sampling_ratio/mean': 0.9999412298202515, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9977599084377289, 'clip_ratio/low_mean': 4.096964960353944e-05, 'clip_ratio/low_min': 1.7403560605089297e-05, 'clip_ratio/high_mean': 3.9648204506193e-06, 'clip_ratio/high_max': 1.58592818024772e-05, 'clip_ratio/region_mean': 4.49344687467601e-05, 'epoch': 0.23} + + 25%|██▍ | 254/1024 [11:27:43<37:12:25, 173.96s/it]INFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:52:42 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▍ | 255/1024 [11:30:41<37:27:39, 175.37s/it] + {'loss': 0.0775, 'grad_norm': 0.0034952745772898197, 
'learning_rate': 1e-05, 'num_tokens': 218496040.0, 'completions/mean_length': 7737.5546875, 'completions/min_length': 713.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7530.04052734375, 'completions/min_terminated_length': 713.0, 'completions/max_terminated_length': 15681.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3085102438926697, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019742710515856743, 'sampling/sampling_logp_difference/max': 9.606889724731445, 'sampling/importance_sampling_ratio/min': 6.726370338583365e-05, 'sampling/importance_sampling_ratio/mean': 0.9999128580093384, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8667014688253403, 'clip_ratio/low_mean': 4.044636898470344e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.566349389278912e-06, 'clip_ratio/high_max': 1.8265397557115648e-05, 'clip_ratio/region_mean': 4.5012717691861326e-05, 'epoch': 0.23} + + 25%|██▍ | 255/1024 [11:30:41<37:27:39, 175.37s/it]INFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:55:41 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▌ | 256/1024 [11:33:39<37:32:41, 175.99s/it] + {'loss': 0.0667, 'grad_norm': 0.0038676802068948746, 'learning_rate': 1e-05, 'num_tokens': 219459140.0, 'completions/mean_length': 7365.84375, 'completions/min_length': 744.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 6601.59326171875, 'completions/min_terminated_length': 744.0, 'completions/max_terminated_length': 15858.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 
'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018882082775235176, 'sampling/sampling_logp_difference/max': 8.360733985900879, 'sampling/importance_sampling_ratio/min': 0.00023387260443996638, 'sampling/importance_sampling_ratio/mean': 0.9999598264694214, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8151945173740387, 'clip_ratio/low_mean': 3.204250072030845e-05, 'clip_ratio/low_min': 3.323495775475749e-06, 'clip_ratio/high_mean': 2.0610737010429148e-06, 'clip_ratio/high_max': 8.244294804171659e-06, 'clip_ratio/region_mean': 3.410357436450795e-05, 'epoch': 0.24} + + 25%|██▌ | 256/1024 [11:33:39<37:32:41, 175.99s/it]INFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:58:41 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 25%|██▌ | 257/1024 [11:36:16<36:16:30, 170.26s/it] + {'loss': 0.1082, 'grad_norm': 0.004310046322643757, 'learning_rate': 1e-05, 'num_tokens': 220304605.0, 'completions/mean_length': 6448.0078125, 'completions/min_length': 1128.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6369.771484375, 'completions/min_terminated_length': 1128.0, 'completions/max_terminated_length': 14556.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.35611939430236816, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020253397524356842, 'sampling/sampling_logp_difference/max': 8.99997615814209, 'sampling/importance_sampling_ratio/min': 0.0001234127557836473, 'sampling/importance_sampling_ratio/mean': 0.9999396800994873, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9546648040413857, 'clip_ratio/low_mean': 5.435333650893881e-05, 'clip_ratio/low_min': 5.33937054569833e-06, 'clip_ratio/high_mean': 2.9462287329806713e-06, 'clip_ratio/high_max': 6.87833608026267e-06, 'clip_ratio/region_mean': 5.729956546929316e-05, 'epoch': 0.24} + + 25%|██▌ | 257/1024 [11:36:16<36:16:30, 170.26s/it]INFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:01:15 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▌ | 258/1024 [11:39:35<38:05:11, 179.00s/it] + {'loss': 0.042, 'grad_norm': 0.0026646999176591635, 'learning_rate': 1e-05, 'num_tokens': 221281968.0, 'completions/mean_length': 7457.6484375, 'completions/min_length': 604.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6941.24755859375, 'completions/min_terminated_length': 
604.0, 'completions/max_terminated_length': 16037.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.2012200653553009, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019208962097764015, 'sampling/sampling_logp_difference/max': 12.749988555908203, 'sampling/importance_sampling_ratio/min': 2.902353571698768e-06, 'sampling/importance_sampling_ratio/mean': 0.9999173283576965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8182889074087143, 'clip_ratio/low_mean': 2.5416685957679874e-05, 'clip_ratio/low_min': 5.5736391004757024e-06, 'clip_ratio/high_mean': 1.5490235227844096e-06, 'clip_ratio/high_max': 6.196094091137638e-06, 'clip_ratio/region_mean': 2.696570959415112e-05, 'epoch': 0.24} + + 25%|██▌ | 258/1024 [11:39:35<38:05:11, 179.00s/it]INFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:04:35 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▌ | 259/1024 [11:42:39<38:19:18, 180.34s/it] + {'loss': 0.0131, 'grad_norm': 0.0016026750672608614, 'learning_rate': 1e-05, 'num_tokens': 222399046.0, 'completions/mean_length': 8561.109375, 'completions/min_length': 558.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7969.79052734375, 'completions/min_terminated_length': 558.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02173236384987831, 'sampling/sampling_logp_difference/max': 13.312499046325684, 'sampling/importance_sampling_ratio/min': 1.653693971093162e-06, 
'sampling/importance_sampling_ratio/mean': 1.000004529953003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9581378549337387, 'clip_ratio/low_mean': 3.127787306311802e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.047383754368639e-06, 'clip_ratio/high_max': 1.6189535017474554e-05, 'clip_ratio/region_mean': 3.532525670379982e-05, 'epoch': 0.24} + + 25%|██▌ | 259/1024 [11:42:39<38:19:18, 180.34s/it]INFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:07:38 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▌ | 260/1024 [11:45:47<38:46:26, 182.70s/it] + {'loss': 0.0845, 'grad_norm': 0.005460259038954973, 'learning_rate': 1e-05, 'num_tokens': 223335010.0, 'completions/mean_length': 7152.34375, 'completions/min_length': 130.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7079.6533203125, 'completions/min_terminated_length': 130.0, 'completions/max_terminated_length': 16239.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3356297016143799, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01986619457602501, 'sampling/sampling_logp_difference/max': 4.589165210723877, 'sampling/importance_sampling_ratio/min': 0.010161337442696095, 'sampling/importance_sampling_ratio/mean': 0.9999966621398926, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9052041247487068, 'clip_ratio/low_mean': 5.2955770115659107e-05, 'clip_ratio/low_min': 3.402656830076012e-06, 'clip_ratio/high_mean': 4.3255887476334465e-06, 'clip_ratio/high_max': 1.4200771602190798e-05, 'clip_ratio/region_mean': 5.7281358749605715e-05, 'epoch': 0.24} + + 25%|██▌ | 260/1024 [11:45:47<38:46:26, 182.70s/it]INFO 12-02 
01:10:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:10:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:10:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:10:47 [block_pool.py:292] Successfully reset prefix cache + + 25%|██▌ | 261/1024 [11:48:22<36:58:10, 174.43s/it] + {'loss': 0.0966, 'grad_norm': 0.005933742038905621, 'learning_rate': 1e-05, 'num_tokens': 224207006.0, 'completions/mean_length': 6678.65625, 'completions/min_length': 963.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6524.603515625, 'completions/min_terminated_length': 963.0, 'completions/max_terminated_length': 15631.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3316681981086731, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019827336072921753, 'sampling/sampling_logp_difference/max': 6.747769355773926, 'sampling/importance_sampling_ratio/min': 0.0011734943836927414, 'sampling/importance_sampling_ratio/mean': 1.000031590461731, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9043187350034714, 'clip_ratio/low_mean': 3.81288905373367e-05, 'clip_ratio/low_min': 8.099272235995159e-06, 'clip_ratio/high_mean': 3.5875787034456152e-06, 'clip_ratio/high_max': 1.4350314813782461e-05, 'clip_ratio/region_mean': 4.1716469809216505e-05, 'epoch': 0.24} + + 25%|██▌ | 261/1024 [11:48:22<36:58:10, 174.43s/it]INFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:13:22 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▌ | 262/1024 [11:51:06<36:15:39, 171.31s/it] + {'loss': 0.104, 'grad_norm': 0.003635740838944912, 'learning_rate': 1e-05, 'num_tokens': 225122891.0, 
'completions/mean_length': 6999.0390625, 'completions/min_length': 990.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6850.07177734375, 'completions/min_terminated_length': 990.0, 'completions/max_terminated_length': 15972.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.323777437210083, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018912551924586296, 'sampling/sampling_logp_difference/max': 10.987512588500977, 'sampling/importance_sampling_ratio/min': 1.6911570128286257e-05, 'sampling/importance_sampling_ratio/mean': 0.9999303817749023, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8109970837831497, 'clip_ratio/low_mean': 3.601791678420341e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.124704844343796e-06, 'clip_ratio/high_max': 1.6498819377375185e-05, 'clip_ratio/region_mean': 4.014262168539062e-05, 'epoch': 0.24} + + 26%|██▌ | 262/1024 [11:51:06<36:15:39, 171.31s/it]INFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:16:06 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▌ | 263/1024 [11:53:56<36:07:50, 170.92s/it] + {'loss': 0.0458, 'grad_norm': 0.003405241761356592, 'learning_rate': 1e-05, 'num_tokens': 226102462.0, 'completions/mean_length': 7483.7109375, 'completions/min_length': 1153.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7045.9912109375, 'completions/min_terminated_length': 1153.0, 'completions/max_terminated_length': 15713.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3022220730781555, 
'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021076779812574387, 'sampling/sampling_logp_difference/max': 5.249300479888916, 'sampling/importance_sampling_ratio/min': 0.00525119062513113, 'sampling/importance_sampling_ratio/mean': 1.00002920627594, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9473970532417297, 'clip_ratio/low_mean': 3.766565987461945e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3818944896447647e-06, 'clip_ratio/high_max': 9.527577958579059e-06, 'clip_ratio/region_mean': 4.004755419373396e-05, 'epoch': 0.24} + + 26%|██▌ | 263/1024 [11:53:56<36:07:50, 170.92s/it]INFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:18:56 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▌ | 264/1024 [11:56:45<35:58:50, 170.43s/it] + {'loss': 0.0801, 'grad_norm': 0.0025927501264959574, 'learning_rate': 1e-05, 'num_tokens': 227093562.0, 'completions/mean_length': 7569.03125, 'completions/min_length': 893.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7357.47216796875, 'completions/min_terminated_length': 893.0, 'completions/max_terminated_length': 16256.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.19097033143043518, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020578444004058838, 'sampling/sampling_logp_difference/max': 5.249953269958496, 'sampling/importance_sampling_ratio/min': 0.0052477638237178326, 'sampling/importance_sampling_ratio/mean': 0.9999816417694092, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9231455475091934, 'clip_ratio/low_mean': 3.8259706570897833e-05, 'clip_ratio/low_min': 3.549019083948224e-06, 
'clip_ratio/high_mean': 3.966830490753637e-06, 'clip_ratio/high_max': 1.5867321963014547e-05, 'clip_ratio/region_mean': 4.2226537743772496e-05, 'epoch': 0.24} + + 26%|██▌ | 264/1024 [11:56:45<35:58:50, 170.43s/it]INFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:21:45 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▌ | 265/1024 [11:59:44<36:28:05, 172.97s/it] + {'loss': 0.04, 'grad_norm': 0.0030512227676808834, 'learning_rate': 1e-05, 'num_tokens': 228086405.0, 'completions/mean_length': 7589.2734375, 'completions/min_length': 130.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7378.2001953125, 'completions/min_terminated_length': 130.0, 'completions/max_terminated_length': 15819.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.27905434370040894, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020208362489938736, 'sampling/sampling_logp_difference/max': 8.437499046325684, 'sampling/importance_sampling_ratio/min': 0.0002165911573683843, 'sampling/importance_sampling_ratio/mean': 1.000004529953003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9265239909291267, 'clip_ratio/low_mean': 4.253613235505327e-05, 'clip_ratio/low_min': 3.5579084851633525e-06, 'clip_ratio/high_mean': 3.36022765168309e-06, 'clip_ratio/high_max': 1.344091060673236e-05, 'clip_ratio/region_mean': 4.5896360120423196e-05, 'epoch': 0.24} + + 26%|██▌ | 265/1024 [11:59:44<36:28:05, 172.97s/it]INFO 12-02 01:24:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:24:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:24:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 
01:24:44 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▌ | 266/1024 [12:02:55<37:32:23, 178.29s/it] + {'loss': 0.0444, 'grad_norm': 0.0022430522367358208, 'learning_rate': 1e-05, 'num_tokens': 229183765.0, 'completions/mean_length': 8420.6875, 'completions/min_length': 1114.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 8096.97509765625, 'completions/min_terminated_length': 1114.0, 'completions/max_terminated_length': 16275.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.309583842754364, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021570362150669098, 'sampling/sampling_logp_difference/max': 8.121989250183105, 'sampling/importance_sampling_ratio/min': 0.00029693738906644285, 'sampling/importance_sampling_ratio/mean': 0.9999421834945679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9572964608669281, 'clip_ratio/low_mean': 3.184792547017423e-05, 'clip_ratio/low_min': 7.29296516510658e-06, 'clip_ratio/high_mean': 4.903381352505676e-06, 'clip_ratio/high_max': 1.9613525410022703e-05, 'clip_ratio/region_mean': 3.675130722058384e-05, 'epoch': 0.24} + + 26%|██▌ | 266/1024 [12:02:55<37:32:23, 178.29s/it]INFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:27:55 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▌ | 267/1024 [12:05:29<35:57:12, 170.98s/it] + {'loss': 0.1058, 'grad_norm': 0.004295065999031067, 'learning_rate': 1e-05, 'num_tokens': 230077607.0, 'completions/mean_length': 6809.765625, 'completions/min_length': 860.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6579.984375, 
'completions/min_terminated_length': 860.0, 'completions/max_terminated_length': 15736.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.20251333713531494, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019895706325769424, 'sampling/sampling_logp_difference/max': 4.886721134185791, 'sampling/importance_sampling_ratio/min': 0.00754612497985363, 'sampling/importance_sampling_ratio/mean': 0.9999294281005859, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.884086549282074, 'clip_ratio/low_mean': 2.1682553096979973e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6821876442918438e-06, 'clip_ratio/high_max': 6.728750577167375e-06, 'clip_ratio/region_mean': 2.336474062758498e-05, 'epoch': 0.25} + + 26%|██▌ | 267/1024 [12:05:29<35:57:12, 170.98s/it]INFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:30:28 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▌ | 268/1024 [12:08:25<36:14:27, 172.58s/it] + {'loss': 0.0096, 'grad_norm': 0.004631794057786465, 'learning_rate': 1e-05, 'num_tokens': 231035616.0, 'completions/mean_length': 7340.6953125, 'completions/min_length': 1616.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6973.0810546875, 'completions/min_terminated_length': 1616.0, 'completions/max_terminated_length': 15080.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3235401213169098, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020591016858816147, 'sampling/sampling_logp_difference/max': 8.290475845336914, 'sampling/importance_sampling_ratio/min': 0.0002508950710762292, 
'sampling/importance_sampling_ratio/mean': 0.9999337792396545, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9920620769262314, 'clip_ratio/low_mean': 5.158006410965754e-05, 'clip_ratio/low_min': 5.210069957684027e-06, 'clip_ratio/high_mean': 7.152336877425114e-06, 'clip_ratio/high_max': 2.8609347509700456e-05, 'clip_ratio/region_mean': 5.873240070286556e-05, 'epoch': 0.25} + + 26%|██▌ | 268/1024 [12:08:25<36:14:27, 172.58s/it]INFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:33:25 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▋ | 269/1024 [12:11:07<35:32:52, 169.50s/it] + {'loss': 0.0455, 'grad_norm': 0.0035752104595303535, 'learning_rate': 1e-05, 'num_tokens': 231920056.0, 'completions/mean_length': 6748.875, 'completions/min_length': 1169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6595.93701171875, 'completions/min_terminated_length': 1169.0, 'completions/max_terminated_length': 14120.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02061416581273079, 'sampling/sampling_logp_difference/max': 7.8571391105651855, 'sampling/importance_sampling_ratio/min': 0.0003869794018100947, 'sampling/importance_sampling_ratio/mean': 0.9999653100967407, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9867061004042625, 'clip_ratio/low_mean': 4.3085940774290066e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.3085940774290066e-05, 'epoch': 0.25} + + 26%|██▋ | 269/1024 [12:11:07<35:32:52, 169.50s/it]INFO 12-02 01:36:07 [block_pool.py:292] Successfully 
reset prefix cache +INFO 12-02 01:36:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:36:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:36:07 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▋ | 270/1024 [12:13:48<34:54:45, 166.69s/it] + {'loss': 0.0711, 'grad_norm': 0.0036644963547587395, 'learning_rate': 1e-05, 'num_tokens': 232869159.0, 'completions/mean_length': 7260.3046875, 'completions/min_length': 1384.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7188.46435546875, 'completions/min_terminated_length': 1384.0, 'completions/max_terminated_length': 15706.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2359209954738617, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02120530977845192, 'sampling/sampling_logp_difference/max': 7.051599502563477, 'sampling/importance_sampling_ratio/min': 0.0008660226594656706, 'sampling/importance_sampling_ratio/mean': 0.9999546408653259, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0388494208455086, 'clip_ratio/low_mean': 3.10397430212106e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1266876021618373e-06, 'clip_ratio/high_max': 1.2506750408647349e-05, 'clip_ratio/region_mean': 3.416643085074611e-05, 'epoch': 0.25} + + 26%|██▋ | 270/1024 [12:13:48<34:54:45, 166.69s/it]INFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:38:47 [block_pool.py:292] Successfully reset prefix cache + + 26%|██▋ | 271/1024 [12:16:25<34:16:39, 163.88s/it] + {'loss': 0.0039, 'grad_norm': 0.004709267523139715, 'learning_rate': 1e-05, 'num_tokens': 233702842.0, 'completions/mean_length': 6354.4609375, 
'completions/min_length': 1035.0, 'completions/max_length': 16073.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6354.4609375, 'completions/min_terminated_length': 1035.0, 'completions/max_terminated_length': 16073.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.3214184641838074, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019126038998365402, 'sampling/sampling_logp_difference/max': 5.37499475479126, 'sampling/importance_sampling_ratio/min': 0.0046309432946145535, 'sampling/importance_sampling_ratio/mean': 0.9999738931655884, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8405331820249557, 'clip_ratio/low_mean': 3.1861192269388994e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.760888254575548e-06, 'clip_ratio/high_max': 2.704355301830219e-05, 'clip_ratio/region_mean': 3.862208097871189e-05, 'epoch': 0.25} + + 26%|██▋ | 271/1024 [12:16:25<34:16:39, 163.88s/it]INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:41:25 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 272/1024 [12:19:12<34:27:03, 164.93s/it] + {'loss': 0.0757, 'grad_norm': 0.003066045930609107, 'learning_rate': 1e-05, 'num_tokens': 234556348.0, 'completions/mean_length': 6514.578125, 'completions/min_length': 982.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6357.9208984375, 'completions/min_terminated_length': 982.0, 'completions/max_terminated_length': 16026.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.31246688961982727, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 
0.019960148259997368, 'sampling/sampling_logp_difference/max': 5.257136344909668, 'sampling/importance_sampling_ratio/min': 0.005210204049944878, 'sampling/importance_sampling_ratio/mean': 0.9999805092811584, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0254098922014236, 'clip_ratio/low_mean': 3.855073941849696e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.437307159652846e-06, 'clip_ratio/high_max': 9.749228638611385e-06, 'clip_ratio/region_mean': 4.098804652130639e-05, 'epoch': 0.25} + + 27%|██▋ | 272/1024 [12:19:12<34:27:03, 164.93s/it]INFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:44:12 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 273/1024 [12:22:06<34:58:24, 167.65s/it] + {'loss': 0.062, 'grad_norm': 0.005132520105689764, 'learning_rate': 1e-05, 'num_tokens': 235521091.0, 'completions/mean_length': 7379.5546875, 'completions/min_length': 701.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7236.62744140625, 'completions/min_terminated_length': 701.0, 'completions/max_terminated_length': 15894.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2301519364118576, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021417103707790375, 'sampling/sampling_logp_difference/max': 8.699974060058594, 'sampling/importance_sampling_ratio/min': 0.00016659013635944575, 'sampling/importance_sampling_ratio/mean': 0.9999256134033203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0397320613265038, 'clip_ratio/low_mean': 3.487835761006863e-05, 'clip_ratio/low_min': 2.9392399483185727e-06, 'clip_ratio/high_mean': 2.6189534310105955e-06, 'clip_ratio/high_max': 
1.0475813724042382e-05, 'clip_ratio/region_mean': 3.749731081370555e-05, 'epoch': 0.25} + + 27%|██▋ | 273/1024 [12:22:06<34:58:24, 167.65s/it]INFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:47:06 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 274/1024 [12:24:58<35:11:27, 168.92s/it] + {'loss': 0.0143, 'grad_norm': 0.0028969801496714354, 'learning_rate': 1e-05, 'num_tokens': 236544160.0, 'completions/mean_length': 7837.1640625, 'completions/min_length': 1346.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7632.04052734375, 'completions/min_terminated_length': 1346.0, 'completions/max_terminated_length': 14565.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.29378965497016907, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019267702475190163, 'sampling/sampling_logp_difference/max': 15.059157371520996, 'sampling/importance_sampling_ratio/min': 2.883308241052873e-07, 'sampling/importance_sampling_ratio/mean': 0.9999887943267822, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8400963917374611, 'clip_ratio/low_mean': 2.6659268655748747e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.776861314643611e-06, 'clip_ratio/high_max': 1.9904123973901733e-05, 'clip_ratio/region_mean': 3.2436129686175263e-05, 'epoch': 0.25} + + 27%|██▋ | 274/1024 [12:24:58<35:11:27, 168.92s/it]INFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:49:58 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 
275/1024 [12:27:26<33:48:40, 162.51s/it] + {'loss': 0.0803, 'grad_norm': 0.003412836929783225, 'learning_rate': 1e-05, 'num_tokens': 237423101.0, 'completions/mean_length': 6696.3515625, 'completions/min_length': 1239.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6542.57958984375, 'completions/min_terminated_length': 1239.0, 'completions/max_terminated_length': 15350.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.37981897592544556, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018458625301718712, 'sampling/sampling_logp_difference/max': 4.410195827484131, 'sampling/importance_sampling_ratio/min': 0.012152798473834991, 'sampling/importance_sampling_ratio/mean': 1.0000269412994385, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8495818004012108, 'clip_ratio/low_mean': 4.060094340729847e-05, 'clip_ratio/low_min': 3.8700886761944275e-06, 'clip_ratio/high_mean': 2.1406925725386827e-06, 'clip_ratio/high_max': 8.562770290154731e-06, 'clip_ratio/region_mean': 4.2741635979837156e-05, 'epoch': 0.25} + + 27%|██▋ | 275/1024 [12:27:26<33:48:40, 162.51s/it]INFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:52:25 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 276/1024 [12:30:41<35:48:44, 172.36s/it] + {'loss': 0.0604, 'grad_norm': 0.0024443145375698805, 'learning_rate': 1e-05, 'num_tokens': 238429956.0, 'completions/mean_length': 7700.3671875, 'completions/min_length': 844.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7121.45849609375, 'completions/min_terminated_length': 844.0, 'completions/max_terminated_length': 15666.0, 
'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2872493863105774, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019427984952926636, 'sampling/sampling_logp_difference/max': 8.250510215759277, 'sampling/importance_sampling_ratio/min': 0.00026112530031241477, 'sampling/importance_sampling_ratio/mean': 0.9999113082885742, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8258870914578438, 'clip_ratio/low_mean': 6.144847083078275e-05, 'clip_ratio/low_min': 1.110105540647055e-05, 'clip_ratio/high_mean': 3.646129641765583e-06, 'clip_ratio/high_max': 1.1463653436294408e-05, 'clip_ratio/region_mean': 6.509460160941671e-05, 'epoch': 0.25} + + 27%|██▋ | 276/1024 [12:30:41<35:48:44, 172.36s/it]INFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:55:41 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 277/1024 [12:33:18<34:48:07, 167.72s/it] + {'loss': 0.0222, 'grad_norm': 0.0022747826296836138, 'learning_rate': 1e-05, 'num_tokens': 239250160.0, 'completions/mean_length': 6255.21875, 'completions/min_length': 793.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6094.44482421875, 'completions/min_terminated_length': 793.0, 'completions/max_terminated_length': 16112.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018723051995038986, 'sampling/sampling_logp_difference/max': 8.241846084594727, 'sampling/importance_sampling_ratio/min': 0.0002633975527714938, 'sampling/importance_sampling_ratio/mean': 0.9999998807907104, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8179014846682549, 'clip_ratio/low_mean': 1.7289162997258245e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0545319355514948e-06, 'clip_ratio/high_max': 4.218127742205979e-06, 'clip_ratio/region_mean': 1.834369493280974e-05, 'epoch': 0.25} + + 27%|██▋ | 277/1024 [12:33:18<34:48:07, 167.72s/it]INFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:58:18 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 278/1024 [12:35:50<33:46:12, 162.97s/it] + {'loss': -0.0056, 'grad_norm': 0.005685295443981886, 'learning_rate': 1e-05, 'num_tokens': 240156211.0, 'completions/mean_length': 6914.9609375, 'completions/min_length': 730.0, 'completions/max_length': 15321.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6914.9609375, 'completions/min_terminated_length': 730.0, 'completions/max_terminated_length': 15321.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021195171400904655, 'sampling/sampling_logp_difference/max': 9.997581481933594, 'sampling/importance_sampling_ratio/min': 4.5509867049986497e-05, 'sampling/importance_sampling_ratio/mean': 0.9998887777328491, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9700981751084328, 'clip_ratio/low_mean': 6.14647315160255e-05, 'clip_ratio/low_min': 5.043576493335422e-06, 'clip_ratio/high_mean': 5.369374321162468e-06, 'clip_ratio/high_max': 1.698448841125355e-05, 'clip_ratio/region_mean': 6.683410583718796e-05, 'epoch': 0.26} + + 27%|██▋ | 278/1024 [12:35:50<33:46:12, 162.97s/it]INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 
02:00:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:00:49 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 279/1024 [12:38:12<32:27:38, 156.86s/it] + {'loss': 0.1246, 'grad_norm': 0.003880272386595607, 'learning_rate': 1e-05, 'num_tokens': 240845295.0, 'completions/mean_length': 5227.53125, 'completions/min_length': 647.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5139.68505859375, 'completions/min_terminated_length': 647.0, 'completions/max_terminated_length': 15469.0, 'rewards/accuracy_reward/mean': 0.6328125, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.6328125, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018801718950271606, 'sampling/sampling_logp_difference/max': 8.993386268615723, 'sampling/importance_sampling_ratio/min': 0.00012422871077433228, 'sampling/importance_sampling_ratio/mean': 1.0000362396240234, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9116031974554062, 'clip_ratio/low_mean': 2.9186837764427764e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.9186837764427764e-05, 'epoch': 0.26} + + 27%|██▋ | 279/1024 [12:38:12<32:27:38, 156.86s/it]INFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:03:12 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 280/1024 [12:41:08<33:34:54, 162.49s/it] + {'loss': 0.0947, 'grad_norm': 0.0028986844699829817, 'learning_rate': 1e-05, 'num_tokens': 241895676.0, 'completions/mean_length': 8065.4765625, 'completions/min_length': 1055.0, 'completions/max_length': 16384.0, 
'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7510.90869140625, 'completions/min_terminated_length': 1055.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3474721610546112, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01853121444582939, 'sampling/sampling_logp_difference/max': 6.3748297691345215, 'sampling/importance_sampling_ratio/min': 0.0017039099475368857, 'sampling/importance_sampling_ratio/mean': 0.9999842643737793, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7446574792265892, 'clip_ratio/low_mean': 5.524710468307603e-05, 'clip_ratio/low_min': 3.776891389861703e-06, 'clip_ratio/high_mean': 8.084949570275057e-06, 'clip_ratio/high_max': 2.5015486926349695e-05, 'clip_ratio/region_mean': 6.333205465125502e-05, 'epoch': 0.26} + + 27%|██▋ | 280/1024 [12:41:08<33:34:54, 162.49s/it]INFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:06:08 [block_pool.py:292] Successfully reset prefix cache + + 27%|██▋ | 281/1024 [12:43:37<32:40:28, 158.32s/it] + {'loss': 0.0368, 'grad_norm': 0.003845847910270095, 'learning_rate': 1e-05, 'num_tokens': 242698258.0, 'completions/mean_length': 6127.359375, 'completions/min_length': 848.0, 'completions/max_length': 15534.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6127.359375, 'completions/min_terminated_length': 848.0, 'completions/max_terminated_length': 15534.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01856958493590355, 
'sampling/sampling_logp_difference/max': 7.746356964111328, 'sampling/importance_sampling_ratio/min': 0.00043231461313553154, 'sampling/importance_sampling_ratio/mean': 1.0000942945480347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8569132760167122, 'clip_ratio/low_mean': 2.896106741445692e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.371585253513331e-06, 'clip_ratio/high_max': 9.486341014053323e-06, 'clip_ratio/region_mean': 3.133265261112683e-05, 'epoch': 0.26} + + 27%|██▋ | 281/1024 [12:43:37<32:40:28, 158.32s/it]INFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:08:36 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 282/1024 [12:46:22<33:04:18, 160.46s/it] + {'loss': 0.0666, 'grad_norm': 0.003953634761273861, 'learning_rate': 1e-05, 'num_tokens': 243560957.0, 'completions/mean_length': 6600.1484375, 'completions/min_length': 1252.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6365.33642578125, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 15192.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018097909167408943, 'sampling/sampling_logp_difference/max': 7.334624767303467, 'sampling/importance_sampling_ratio/min': 0.0006525487406179309, 'sampling/importance_sampling_ratio/mean': 0.9999537467956543, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.78924310952425, 'clip_ratio/low_mean': 4.3558867673709756e-05, 'clip_ratio/low_min': 4.417741820361698e-06, 'clip_ratio/high_mean': 7.4620825216697995e-06, 'clip_ratio/high_max': 2.9848330086679198e-05, 
'clip_ratio/region_mean': 5.1020949285884853e-05, 'epoch': 0.26} + + 28%|██▊ | 282/1024 [12:46:22<33:04:18, 160.46s/it]INFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:11:22 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 283/1024 [12:49:12<33:35:54, 163.23s/it] + {'loss': 0.0265, 'grad_norm': 0.00360781978815794, 'learning_rate': 1e-05, 'num_tokens': 244585923.0, 'completions/mean_length': 7852.171875, 'completions/min_length': 1276.0, 'completions/max_length': 15755.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7852.171875, 'completions/min_terminated_length': 1276.0, 'completions/max_terminated_length': 15755.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.19438527524471283, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.022330068051815033, 'sampling/sampling_logp_difference/max': 10.076086044311523, 'sampling/importance_sampling_ratio/min': 4.2073770600836724e-05, 'sampling/importance_sampling_ratio/mean': 0.9999812841415405, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0598893761634827, 'clip_ratio/low_mean': 2.737523408313791e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6588904259151604e-06, 'clip_ratio/high_max': 6.635561703660642e-06, 'clip_ratio/region_mean': 2.9034124281679397e-05, 'epoch': 0.26} + + 28%|██▊ | 283/1024 [12:49:12<33:35:54, 163.23s/it]INFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:14:11 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 284/1024 [12:52:20<35:05:31, 170.72s/it] + 
{'loss': 0.0587, 'grad_norm': 0.0027661293279379606, 'learning_rate': 1e-05, 'num_tokens': 245628064.0, 'completions/mean_length': 7972.2265625, 'completions/min_length': 610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7700.87890625, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.1872510462999344, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021125148981809616, 'sampling/sampling_logp_difference/max': 10.366576194763184, 'sampling/importance_sampling_ratio/min': 3.1466843211092055e-05, 'sampling/importance_sampling_ratio/mean': 0.9999428987503052, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.933217465877533, 'clip_ratio/low_mean': 4.7973388973332476e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.885042236921436e-07, 'clip_ratio/high_max': 3.1540168947685743e-06, 'clip_ratio/region_mean': 4.876189268543385e-05, 'epoch': 0.26} + + 28%|██▊ | 284/1024 [12:52:20<35:05:31, 170.72s/it]INFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:17:20 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 285/1024 [12:55:06<34:44:04, 169.21s/it] + {'loss': 0.0786, 'grad_norm': 0.005680318456143141, 'learning_rate': 1e-05, 'num_tokens': 246561329.0, 'completions/mean_length': 7135.6953125, 'completions/min_length': 640.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6913.736328125, 'completions/min_terminated_length': 640.0, 'completions/max_terminated_length': 15744.0, 'rewards/accuracy_reward/mean': 0.4296875, 
'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3077537715435028, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018504241481423378, 'sampling/sampling_logp_difference/max': 9.737424850463867, 'sampling/importance_sampling_ratio/min': 5.9032357967225835e-05, 'sampling/importance_sampling_ratio/mean': 0.9999462366104126, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7786942347884178, 'clip_ratio/low_mean': 4.6317693090713874e-05, 'clip_ratio/low_min': 3.820877282123547e-06, 'clip_ratio/high_mean': 3.241492265715351e-06, 'clip_ratio/high_max': 1.2965969062861404e-05, 'clip_ratio/region_mean': 4.955918507221213e-05, 'epoch': 0.26} + + 28%|██▊ | 285/1024 [12:55:06<34:44:04, 169.21s/it]INFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:20:05 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 286/1024 [12:57:53<34:35:02, 168.70s/it] + {'loss': 0.1072, 'grad_norm': 0.0026402862276881933, 'learning_rate': 1e-05, 'num_tokens': 247437415.0, 'completions/mean_length': 6704.046875, 'completions/min_length': 155.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6627.82666015625, 'completions/min_terminated_length': 155.0, 'completions/max_terminated_length': 16161.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.31276631355285645, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02134273201227188, 'sampling/sampling_logp_difference/max': 7.156195640563965, 'sampling/importance_sampling_ratio/min': 0.0007800163584761322, 'sampling/importance_sampling_ratio/mean': 0.9998904466629028, 'sampling/importance_sampling_ratio/max': 2.0, 
'entropy': 1.0435140281915665, 'clip_ratio/low_mean': 4.22437145175536e-05, 'clip_ratio/low_min': 1.4025082009538892e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.22437145175536e-05, 'epoch': 0.26} + + 28%|██▊ | 286/1024 [12:57:53<34:35:02, 168.70s/it]INFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:22:53 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 287/1024 [13:00:28<33:41:57, 164.61s/it] + {'loss': 0.0565, 'grad_norm': 0.003993614576756954, 'learning_rate': 1e-05, 'num_tokens': 248211112.0, 'completions/mean_length': 5892.5078125, 'completions/min_length': 249.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5725.9765625, 'completions/min_terminated_length': 249.0, 'completions/max_terminated_length': 15708.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3322049677371979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01924925297498703, 'sampling/sampling_logp_difference/max': 6.005458354949951, 'sampling/importance_sampling_ratio/min': 0.0024652592837810516, 'sampling/importance_sampling_ratio/mean': 1.0000004768371582, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8004944771528244, 'clip_ratio/low_mean': 4.7084630978133646e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.8746438159905665e-06, 'clip_ratio/high_max': 2.223430897174694e-05, 'clip_ratio/region_mean': 5.3959275192028144e-05, 'epoch': 0.26} + + 28%|██▊ | 287/1024 [13:00:28<33:41:57, 164.61s/it]INFO 12-02 02:25:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:25:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:25:28 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:25:28 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 288/1024 [13:03:43<35:31:01, 173.72s/it] + {'loss': 0.0823, 'grad_norm': 0.001573400106281042, 'learning_rate': 1e-05, 'num_tokens': 249228106.0, 'completions/mean_length': 7812.140625, 'completions/min_length': 1515.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7316.24755859375, 'completions/min_terminated_length': 1515.0, 'completions/max_terminated_length': 15892.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2767002284526825, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01956877112388611, 'sampling/sampling_logp_difference/max': 6.906228542327881, 'sampling/importance_sampling_ratio/min': 0.001001527882181108, 'sampling/importance_sampling_ratio/mean': 0.9998818635940552, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8841542899608612, 'clip_ratio/low_mean': 3.415995615796419e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.458270550207089e-06, 'clip_ratio/high_max': 2.1833082200828358e-05, 'clip_ratio/region_mean': 3.961822596920683e-05, 'epoch': 0.26} + + 28%|██▊ | 288/1024 [13:03:43<35:31:01, 173.72s/it]INFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:28:43 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 289/1024 [13:06:33<35:13:31, 172.53s/it] + {'loss': 0.0438, 'grad_norm': 0.0021125099156051874, 'learning_rate': 1e-05, 'num_tokens': 250063284.0, 'completions/mean_length': 6372.953125, 'completions/min_length': 686.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 
'completions/mean_terminated_length': 6132.6884765625, 'completions/min_terminated_length': 686.0, 'completions/max_terminated_length': 16250.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01943521574139595, 'sampling/sampling_logp_difference/max': 9.937475204467773, 'sampling/importance_sampling_ratio/min': 4.8329173296224326e-05, 'sampling/importance_sampling_ratio/mean': 0.9999308586120605, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8228401988744736, 'clip_ratio/low_mean': 3.068193461785995e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.53609755418438e-06, 'clip_ratio/high_max': 1.014439021673752e-05, 'clip_ratio/region_mean': 3.321803217204433e-05, 'epoch': 0.27} + + 28%|██▊ | 289/1024 [13:06:33<35:13:31, 172.53s/it]INFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:31:33 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 290/1024 [13:09:49<36:37:07, 179.60s/it] + {'loss': 0.0565, 'grad_norm': 0.0022315154783427715, 'learning_rate': 1e-05, 'num_tokens': 251085123.0, 'completions/mean_length': 7817.8671875, 'completions/min_length': 1568.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7396.58154296875, 'completions/min_terminated_length': 1568.0, 'completions/max_terminated_length': 16270.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021764669567346573, 'sampling/sampling_logp_difference/max': 12.760490417480469, 
'sampling/importance_sampling_ratio/min': 2.8720330647047376e-06, 'sampling/importance_sampling_ratio/mean': 0.99993896484375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9454319775104523, 'clip_ratio/low_mean': 2.526416994896863e-05, 'clip_ratio/low_min': 6.7760895490209805e-06, 'clip_ratio/high_mean': 1.7559765410624095e-06, 'clip_ratio/high_max': 7.023906164249638e-06, 'clip_ratio/region_mean': 2.7020146660561295e-05, 'epoch': 0.27} + + 28%|██▊ | 290/1024 [13:09:49<36:37:07, 179.60s/it]INFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:34:49 [block_pool.py:292] Successfully reset prefix cache + + 28%|██▊ | 291/1024 [13:12:50<36:38:48, 179.98s/it] + {'loss': 0.0808, 'grad_norm': 0.004663965664803982, 'learning_rate': 1e-05, 'num_tokens': 252020906.0, 'completions/mean_length': 7168.4921875, 'completions/min_length': 817.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6635.36328125, 'completions/min_terminated_length': 817.0, 'completions/max_terminated_length': 16352.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2927239239215851, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01929781585931778, 'sampling/sampling_logp_difference/max': 7.861782550811768, 'sampling/importance_sampling_ratio/min': 0.0003851866349577904, 'sampling/importance_sampling_ratio/mean': 0.9999589920043945, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8433891162276268, 'clip_ratio/low_mean': 4.36271948274225e-05, 'clip_ratio/low_min': 3.6957101201551268e-06, 'clip_ratio/high_mean': 3.699491571751423e-06, 'clip_ratio/high_max': 1.4797966287005693e-05, 'clip_ratio/region_mean': 4.732668639917392e-05, 
'epoch': 0.27} + + 28%|██▊ | 291/1024 [13:12:50<36:38:48, 179.98s/it]INFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:37:50 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▊ | 292/1024 [13:15:34<35:38:33, 175.29s/it] + {'loss': 0.079, 'grad_norm': 0.0036942458245903254, 'learning_rate': 1e-05, 'num_tokens': 252977435.0, 'completions/mean_length': 7322.5078125, 'completions/min_length': 1196.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6876.8603515625, 'completions/min_terminated_length': 1196.0, 'completions/max_terminated_length': 16301.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.24275577068328857, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0205365102738142, 'sampling/sampling_logp_difference/max': 8.124969482421875, 'sampling/importance_sampling_ratio/min': 0.00029605376766994596, 'sampling/importance_sampling_ratio/mean': 0.9999804496765137, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9157031401991844, 'clip_ratio/low_mean': 4.2792244585143635e-05, 'clip_ratio/low_min': 1.0337215371691855e-05, 'clip_ratio/high_mean': 6.089093403716106e-06, 'clip_ratio/high_max': 1.996871560550062e-05, 'clip_ratio/region_mean': 4.8881338216233416e-05, 'epoch': 0.27} + + 29%|██▊ | 292/1024 [13:15:34<35:38:33, 175.29s/it]INFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:40:34 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▊ | 293/1024 [13:18:20<35:01:00, 172.45s/it] + {'loss': 0.0227, 
'grad_norm': 0.0034127074759453535, 'learning_rate': 1e-05, 'num_tokens': 253896161.0, 'completions/mean_length': 7025.484375, 'completions/min_length': 337.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6723.5966796875, 'completions/min_terminated_length': 337.0, 'completions/max_terminated_length': 16078.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.27722424268722534, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.023741140961647034, 'sampling/sampling_logp_difference/max': 7.562129497528076, 'sampling/importance_sampling_ratio/min': 0.0005197672289796174, 'sampling/importance_sampling_ratio/mean': 0.9999400973320007, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1329731941223145, 'clip_ratio/low_mean': 2.631919460327481e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.631919460327481e-05, 'epoch': 0.27} + + 29%|██▊ | 293/1024 [13:18:20<35:01:00, 172.45s/it]INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:43:20 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▊ | 294/1024 [13:21:07<34:36:46, 170.69s/it] + {'loss': 0.0156, 'grad_norm': 0.003959407564252615, 'learning_rate': 1e-05, 'num_tokens': 254690264.0, 'completions/mean_length': 5996.1796875, 'completions/min_length': 882.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5661.08837890625, 'completions/min_terminated_length': 882.0, 'completions/max_terminated_length': 13776.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 
0.26645541191101074, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018407585099339485, 'sampling/sampling_logp_difference/max': 15.73043155670166, 'sampling/importance_sampling_ratio/min': 1.4735347519945208e-07, 'sampling/importance_sampling_ratio/mean': 0.9999563694000244, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8773328885436058, 'clip_ratio/low_mean': 2.4661783299961826e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.092160914595297e-06, 'clip_ratio/high_max': 4.368643658381188e-06, 'clip_ratio/region_mean': 2.5753944555617636e-05, 'epoch': 0.27} + + 29%|██▊ | 294/1024 [13:21:07<34:36:46, 170.69s/it]INFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:46:06 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 295/1024 [13:23:49<34:02:37, 168.12s/it] + {'loss': 0.0541, 'grad_norm': 0.0030910037457942963, 'learning_rate': 1e-05, 'num_tokens': 255626394.0, 'completions/mean_length': 7165.328125, 'completions/min_length': 1115.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6867.951171875, 'completions/min_terminated_length': 1115.0, 'completions/max_terminated_length': 16383.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020668907091021538, 'sampling/sampling_logp_difference/max': 8.407832145690918, 'sampling/importance_sampling_ratio/min': 0.00022311302018351853, 'sampling/importance_sampling_ratio/mean': 1.0000731945037842, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9502597972750664, 'clip_ratio/low_mean': 3.736187466074625e-05, 'clip_ratio/low_min': 0.0, 
'clip_ratio/high_mean': 4.041209194838302e-06, 'clip_ratio/high_max': 1.616483677935321e-05, 'clip_ratio/region_mean': 4.140308453770558e-05, 'epoch': 0.27} + + 29%|██▉ | 295/1024 [13:23:49<34:02:37, 168.12s/it]INFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:48:48 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 296/1024 [13:26:45<34:29:10, 170.54s/it] + {'loss': 0.0463, 'grad_norm': 0.0037233952898532152, 'learning_rate': 1e-05, 'num_tokens': 256673457.0, 'completions/mean_length': 8001.9296875, 'completions/min_length': 164.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7661.34912109375, 'completions/min_terminated_length': 164.0, 'completions/max_terminated_length': 15375.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020540472120046616, 'sampling/sampling_logp_difference/max': 6.124904632568359, 'sampling/importance_sampling_ratio/min': 0.0021876997780054808, 'sampling/importance_sampling_ratio/mean': 0.9999151229858398, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8591345250606537, 'clip_ratio/low_mean': 5.5247357522603124e-05, 'clip_ratio/low_min': 3.6811261452385224e-06, 'clip_ratio/high_mean': 2.9256716516101733e-06, 'clip_ratio/high_max': 1.1702686606440693e-05, 'clip_ratio/region_mean': 5.8173028264718596e-05, 'epoch': 0.27} + + 29%|██▉ | 296/1024 [13:26:45<34:29:10, 170.54s/it]INFO 12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache +INFO 
12-02 02:51:45 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 297/1024 [13:29:27<33:55:33, 168.00s/it] + {'loss': 0.0669, 'grad_norm': 0.006054217461496592, 'learning_rate': 1e-05, 'num_tokens': 257578501.0, 'completions/mean_length': 6924.84375, 'completions/min_length': 803.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6697.82421875, 'completions/min_terminated_length': 803.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2927239239215851, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019306108355522156, 'sampling/sampling_logp_difference/max': 4.842195510864258, 'sampling/importance_sampling_ratio/min': 0.007889713160693645, 'sampling/importance_sampling_ratio/mean': 0.9999213218688965, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7969356626272202, 'clip_ratio/low_mean': 3.570647322703735e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2759249216287571e-05, 'clip_ratio/high_max': 3.721341136042611e-05, 'clip_ratio/region_mean': 4.846572301175911e-05, 'epoch': 0.27} + + 29%|██▉ | 297/1024 [13:29:27<33:55:33, 168.00s/it]INFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:54:27 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 298/1024 [13:31:59<32:55:41, 163.28s/it] + {'loss': 0.0689, 'grad_norm': 0.004903806839138269, 'learning_rate': 1e-05, 'num_tokens': 258392625.0, 'completions/mean_length': 6203.03125, 'completions/min_length': 180.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5958.6884765625, 
'completions/min_terminated_length': 180.0, 'completions/max_terminated_length': 14439.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01909301057457924, 'sampling/sampling_logp_difference/max': 8.498823165893555, 'sampling/importance_sampling_ratio/min': 0.00020370795391499996, 'sampling/importance_sampling_ratio/mean': 0.9999826550483704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8734413683414459, 'clip_ratio/low_mean': 5.2388056587915344e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5528859737278253e-06, 'clip_ratio/high_max': 1.0211543894911301e-05, 'clip_ratio/region_mean': 5.4940942732173426e-05, 'epoch': 0.27} + + 29%|██▉ | 298/1024 [13:31:59<32:55:41, 163.28s/it]INFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:56:59 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 299/1024 [13:34:57<33:44:39, 167.56s/it] + {'loss': 0.0625, 'grad_norm': 0.0033637424930930138, 'learning_rate': 1e-05, 'num_tokens': 259435270.0, 'completions/mean_length': 7982.5390625, 'completions/min_length': 776.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7641.01611328125, 'completions/min_terminated_length': 776.0, 'completions/max_terminated_length': 15554.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.31246691942214966, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02182736061513424, 'sampling/sampling_logp_difference/max': 6.406092166900635, 'sampling/importance_sampling_ratio/min': 
0.0016514655435457826, 'sampling/importance_sampling_ratio/mean': 0.9999765753746033, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0091779381036758, 'clip_ratio/low_mean': 4.373456977191381e-05, 'clip_ratio/low_min': 3.670856358439778e-06, 'clip_ratio/high_mean': 4.64845766146027e-06, 'clip_ratio/high_max': 1.5135058674786706e-05, 'clip_ratio/region_mean': 4.8383026296505705e-05, 'epoch': 0.28} + + 29%|██▉ | 299/1024 [13:34:57<33:44:39, 167.56s/it]INFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 02:59:57 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 300/1024 [13:37:41<33:29:23, 166.52s/it] + {'loss': 0.144, 'grad_norm': 0.0052203768864274025, 'learning_rate': 1e-05, 'num_tokens': 260337614.0, 'completions/mean_length': 6915.3125, 'completions/min_length': 778.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6688.064453125, 'completions/min_terminated_length': 778.0, 'completions/max_terminated_length': 16265.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.37928223609924316, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017896221950650215, 'sampling/sampling_logp_difference/max': 9.562335968017578, 'sampling/importance_sampling_ratio/min': 7.032832218101248e-05, 'sampling/importance_sampling_ratio/mean': 0.9999016523361206, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7964543774724007, 'clip_ratio/low_mean': 5.2442986770984135e-05, 'clip_ratio/low_min': 8.75736759553547e-06, 'clip_ratio/high_mean': 5.991175669350923e-06, 'clip_ratio/high_max': 2.3964702677403693e-05, 'clip_ratio/region_mean': 5.843416238349164e-05, 'epoch': 0.28} + + 29%|██▉ | 300/1024 
[13:37:41<33:29:23, 166.52s/it]INFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:02:41 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 301/1024 [13:40:24<33:14:21, 165.51s/it] + {'loss': 0.0984, 'grad_norm': 0.005570738110691309, 'learning_rate': 1e-05, 'num_tokens': 261254070.0, 'completions/mean_length': 7029.4375, 'completions/min_length': 679.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6880.95263671875, 'completions/min_terminated_length': 679.0, 'completions/max_terminated_length': 16198.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3327290117740631, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01905740052461624, 'sampling/sampling_logp_difference/max': 7.005340576171875, 'sampling/importance_sampling_ratio/min': 0.0009070249507203698, 'sampling/importance_sampling_ratio/mean': 0.9999494552612305, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8605096861720085, 'clip_ratio/low_mean': 6.243192206056847e-05, 'clip_ratio/low_min': 1.2397775662975619e-05, 'clip_ratio/high_mean': 1.1145679081892013e-05, 'clip_ratio/high_max': 4.458271632756805e-05, 'clip_ratio/region_mean': 7.357759886872373e-05, 'epoch': 0.28} + + 29%|██▉ | 301/1024 [13:40:24<33:14:21, 165.51s/it]INFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:05:24 [block_pool.py:292] Successfully reset prefix cache + + 29%|██▉ | 302/1024 [13:43:06<32:59:33, 164.51s/it] + {'loss': 0.062, 'grad_norm': 0.004496110137552023, 
'learning_rate': 1e-05, 'num_tokens': 262024906.0, 'completions/mean_length': 5858.84375, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5606.240234375, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 15987.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019225869327783585, 'sampling/sampling_logp_difference/max': 7.812377452850342, 'sampling/importance_sampling_ratio/min': 0.00040469475788995624, 'sampling/importance_sampling_ratio/mean': 0.9999294877052307, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8430554121732712, 'clip_ratio/low_mean': 7.46641262594494e-05, 'clip_ratio/low_min': 5.041745680500753e-06, 'clip_ratio/high_mean': 1.1191766247975465e-05, 'clip_ratio/high_max': 3.390461233720998e-05, 'clip_ratio/region_mean': 8.585589102949598e-05, 'epoch': 0.28} + + 29%|██▉ | 302/1024 [13:43:06<32:59:33, 164.51s/it]INFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:08:06 [block_pool.py:292] Successfully reset prefix cache + + 30%|██▉ | 303/1024 [13:46:10<34:07:24, 170.38s/it] + {'loss': -0.0002, 'grad_norm': 0.00348713924176991, 'learning_rate': 1e-05, 'num_tokens': 263110844.0, 'completions/mean_length': 8337.328125, 'completions/min_length': 837.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7728.7568359375, 'completions/min_terminated_length': 837.0, 'completions/max_terminated_length': 15976.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 
'reward': 0.296875, 'reward_std': 0.20805485546588898, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02157524600625038, 'sampling/sampling_logp_difference/max': 6.090071678161621, 'sampling/importance_sampling_ratio/min': 0.0022652465850114822, 'sampling/importance_sampling_ratio/mean': 0.9998900890350342, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.901745393872261, 'clip_ratio/low_mean': 3.7080020149460324e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.140988825289242e-07, 'clip_ratio/high_max': 3.2563955301156966e-06, 'clip_ratio/region_mean': 3.789411886145899e-05, 'epoch': 0.28} + + 30%|██▉ | 303/1024 [13:46:10<34:07:24, 170.38s/it]INFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:11:10 [block_pool.py:292] Successfully reset prefix cache + + 30%|██▉ | 304/1024 [13:49:10<34:36:46, 173.06s/it] + {'loss': 0.0551, 'grad_norm': 0.003980033565312624, 'learning_rate': 1e-05, 'num_tokens': 264036169.0, 'completions/mean_length': 7084.7265625, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6381.42041015625, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 16315.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.27434611320495605, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018782664090394974, 'sampling/sampling_logp_difference/max': 8.999666213989258, 'sampling/importance_sampling_ratio/min': 0.00012345099821686745, 'sampling/importance_sampling_ratio/mean': 0.9999673366546631, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8265534415841103, 'clip_ratio/low_mean': 
2.823553325015382e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.934936211815511e-06, 'clip_ratio/high_max': 2.3739744847262045e-05, 'clip_ratio/region_mean': 3.417046866616147e-05, 'epoch': 0.28} + + 30%|██▉ | 304/1024 [13:49:10<34:36:46, 173.06s/it]INFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:14:09 [block_pool.py:292] Successfully reset prefix cache + + 30%|██▉ | 305/1024 [13:51:51<33:52:34, 169.62s/it] + {'loss': 0.1139, 'grad_norm': 0.006467343773692846, 'learning_rate': 1e-05, 'num_tokens': 264892767.0, 'completions/mean_length': 6543.796875, 'completions/min_length': 93.0, 'completions/max_length': 16292.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6543.796875, 'completions/min_terminated_length': 93.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3934885561466217, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.02032080665230751, 'sampling/sampling_logp_difference/max': 9.221251487731934, 'sampling/importance_sampling_ratio/min': 9.891482477542013e-05, 'sampling/importance_sampling_ratio/mean': 1.0000489950180054, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8899869695305824, 'clip_ratio/low_mean': 6.913120819262986e-05, 'clip_ratio/low_min': 2.494283216947224e-05, 'clip_ratio/high_mean': 3.771558226617344e-06, 'clip_ratio/high_max': 1.1745505617000163e-05, 'clip_ratio/region_mean': 7.290276607818669e-05, 'epoch': 0.28} + + 30%|██▉ | 305/1024 [13:51:51<33:52:34, 169.62s/it]INFO 12-02 03:16:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:16:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:16:51 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-02 03:16:51 [block_pool.py:292] Successfully reset prefix cache + + 30%|██▉ | 306/1024 [13:54:58<34:50:44, 174.71s/it] + {'loss': 0.0948, 'grad_norm': 0.003174177836626768, 'learning_rate': 1e-05, 'num_tokens': 265995697.0, 'completions/mean_length': 8483.390625, 'completions/min_length': 1342.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7813.84765625, 'completions/min_terminated_length': 1342.0, 'completions/max_terminated_length': 16307.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02145479805767536, 'sampling/sampling_logp_difference/max': 7.4824934005737305, 'sampling/importance_sampling_ratio/min': 0.0005628522485494614, 'sampling/importance_sampling_ratio/mean': 1.0000269412994385, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9621479511260986, 'clip_ratio/low_mean': 4.395576979732141e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.395576979732141e-05, 'epoch': 0.28} + + 30%|██▉ | 306/1024 [13:54:58<34:50:44, 174.71s/it]INFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:19:58 [block_pool.py:292] Successfully reset prefix cache + + 30%|██▉ | 307/1024 [13:57:43<34:13:09, 171.81s/it] + {'loss': 0.0887, 'grad_norm': 0.003356153378263116, 'learning_rate': 1e-05, 'num_tokens': 266937707.0, 'completions/mean_length': 7184.578125, 'completions/min_length': 419.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6963.79248046875, 
'completions/min_terminated_length': 419.0, 'completions/max_terminated_length': 14985.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.25566399097442627, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02204768732190132, 'sampling/sampling_logp_difference/max': 6.374974727630615, 'sampling/importance_sampling_ratio/min': 0.0017036627978086472, 'sampling/importance_sampling_ratio/mean': 1.0000238418579102, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9993807673454285, 'clip_ratio/low_mean': 3.7911659774181317e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.149131202917488e-06, 'clip_ratio/high_max': 1.2596524811669951e-05, 'clip_ratio/region_mean': 4.106079018129094e-05, 'epoch': 0.28} + + 30%|██▉ | 307/1024 [13:57:43<34:13:09, 171.81s/it]INFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:22:43 [block_pool.py:292] Successfully reset prefix cache + + 30%|███ | 308/1024 [14:00:35<34:11:36, 171.92s/it] + {'loss': 0.0519, 'grad_norm': 0.006375293247401714, 'learning_rate': 1e-05, 'num_tokens': 267853880.0, 'completions/mean_length': 7029.2265625, 'completions/min_length': 851.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6727.45947265625, 'completions/min_terminated_length': 851.0, 'completions/max_terminated_length': 16216.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.27328038215637207, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020365029573440552, 'sampling/sampling_logp_difference/max': 4.542207717895508, 'sampling/importance_sampling_ratio/min': 0.010649868287146091, 
'sampling/importance_sampling_ratio/mean': 1.000023365020752, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9139953926205635, 'clip_ratio/low_mean': 4.8845648166206956e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.811290921225009e-06, 'clip_ratio/high_max': 1.9245163684900035e-05, 'clip_ratio/region_mean': 5.365693925796222e-05, 'epoch': 0.28} + + 30%|███ | 308/1024 [14:00:35<34:11:36, 171.92s/it]INFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:25:35 [block_pool.py:292] Successfully reset prefix cache + + 30%|███ | 309/1024 [14:03:09<33:05:43, 166.63s/it] + {'loss': 0.0733, 'grad_norm': 0.003697809297591448, 'learning_rate': 1e-05, 'num_tokens': 268665721.0, 'completions/mean_length': 6188.0078125, 'completions/min_length': 612.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5943.30419921875, 'completions/min_terminated_length': 612.0, 'completions/max_terminated_length': 16106.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.20699402689933777, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.016581017524003983, 'sampling/sampling_logp_difference/max': 3.531106472015381, 'sampling/importance_sampling_ratio/min': 0.02927250787615776, 'sampling/importance_sampling_ratio/mean': 0.9999372363090515, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7640773430466652, 'clip_ratio/low_mean': 2.5999243803198624e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2031262031086953e-06, 'clip_ratio/high_max': 4.812504812434781e-06, 'clip_ratio/region_mean': 2.720237000630732e-05, 'epoch': 0.28} + + 30%|███ | 309/1024 [14:03:09<33:05:43, 166.63s/it]INFO 12-02 03:28:09 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:28:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:28:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:28:09 [block_pool.py:292] Successfully reset prefix cache + + 30%|███ | 310/1024 [14:06:11<33:57:39, 171.23s/it] + {'loss': 0.0763, 'grad_norm': 0.002286596456542611, 'learning_rate': 1e-05, 'num_tokens': 269726181.0, 'completions/mean_length': 8128.21875, 'completions/min_length': 1227.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7861.90283203125, 'completions/min_terminated_length': 1227.0, 'completions/max_terminated_length': 16264.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.24435341358184814, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019403984770178795, 'sampling/sampling_logp_difference/max': 12.90043830871582, 'sampling/importance_sampling_ratio/min': 2.4969556307041785e-06, 'sampling/importance_sampling_ratio/mean': 0.9999798536300659, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8218234181404114, 'clip_ratio/low_mean': 2.1358927824621787e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.1358927824621787e-05, 'epoch': 0.29} + + 30%|███ | 310/1024 [14:06:11<33:57:39, 171.23s/it]INFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:31:11 [block_pool.py:292] Successfully reset prefix cache + + 30%|███ | 311/1024 [14:08:33<32:08:31, 162.29s/it] + {'loss': 0.0193, 'grad_norm': 0.00485506234690547, 'learning_rate': 1e-05, 'num_tokens': 270470616.0, 'completions/mean_length': 5673.3359375, 'completions/min_length': 
306.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5503.32568359375, 'completions/min_terminated_length': 306.0, 'completions/max_terminated_length': 16256.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.25354230403900146, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01881871558725834, 'sampling/sampling_logp_difference/max': 6.999490737915039, 'sampling/importance_sampling_ratio/min': 0.0009123464697040617, 'sampling/importance_sampling_ratio/mean': 1.0000226497650146, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9275510385632515, 'clip_ratio/low_mean': 3.0363167581981543e-05, 'clip_ratio/low_min': 6.364238288369961e-06, 'clip_ratio/high_mean': 3.7021193293185206e-06, 'clip_ratio/high_max': 1.4808477317274082e-05, 'clip_ratio/region_mean': 3.4065286854456645e-05, 'epoch': 0.29} + + 30%|███ | 311/1024 [14:08:33<32:08:31, 162.29s/it]INFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:33:32 [block_pool.py:292] Successfully reset prefix cache + + 30%|███ | 312/1024 [14:11:32<33:04:54, 167.27s/it] + {'loss': 0.032, 'grad_norm': 0.005874342750757933, 'learning_rate': 1e-05, 'num_tokens': 271377723.0, 'completions/mean_length': 6944.8984375, 'completions/min_length': 896.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6795.07177734375, 'completions/min_terminated_length': 896.0, 'completions/max_terminated_length': 16382.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 
'sampling/sampling_logp_difference/mean': 0.020590776577591896, 'sampling/sampling_logp_difference/max': 10.049861907958984, 'sampling/importance_sampling_ratio/min': 4.3191710574319586e-05, 'sampling/importance_sampling_ratio/mean': 1.0000594854354858, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9335741624236107, 'clip_ratio/low_mean': 3.968570712231667e-05, 'clip_ratio/low_min': 3.4213767321489286e-06, 'clip_ratio/high_mean': 3.6739949109687586e-06, 'clip_ratio/high_max': 1.1274602456978755e-05, 'clip_ratio/region_mean': 4.335970191959859e-05, 'epoch': 0.29} + + 30%|███ | 312/1024 [14:11:32<33:04:54, 167.27s/it]INFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:36:31 [block_pool.py:292] Successfully reset prefix cache + + 31%|███ | 313/1024 [14:14:25<33:22:48, 169.01s/it] + {'loss': 0.06, 'grad_norm': 0.001684082904830575, 'learning_rate': 1e-05, 'num_tokens': 272384891.0, 'completions/mean_length': 7705.625, 'completions/min_length': 329.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7278.8193359375, 'completions/min_terminated_length': 329.0, 'completions/max_terminated_length': 15806.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2648528814315796, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020136822015047073, 'sampling/sampling_logp_difference/max': 9.624967575073242, 'sampling/importance_sampling_ratio/min': 6.605865200981498e-05, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8491624072194099, 'clip_ratio/low_mean': 3.206376845810155e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 
3.16031673719408e-06, 'clip_ratio/high_max': 1.264126694877632e-05, 'clip_ratio/region_mean': 3.522408474054828e-05, 'epoch': 0.29} + + 31%|███ | 313/1024 [14:14:25<33:22:48, 169.01s/it]INFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:39:24 [block_pool.py:292] Successfully reset prefix cache + + 31%|███ | 314/1024 [14:17:10<33:05:18, 167.77s/it] + {'loss': 0.1233, 'grad_norm': 0.003692191792652011, 'learning_rate': 1e-05, 'num_tokens': 273251630.0, 'completions/mean_length': 6611.1484375, 'completions/min_length': 1116.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6534.19677734375, 'completions/min_terminated_length': 1116.0, 'completions/max_terminated_length': 15923.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.27564430236816406, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019237037748098373, 'sampling/sampling_logp_difference/max': 5.774331569671631, 'sampling/importance_sampling_ratio/min': 0.0031062732450664043, 'sampling/importance_sampling_ratio/mean': 0.9999606609344482, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8867302760481834, 'clip_ratio/low_mean': 3.8573590472879005e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.443089442749624e-06, 'clip_ratio/high_max': 9.772357770998497e-06, 'clip_ratio/region_mean': 4.101667946088128e-05, 'epoch': 0.29} + + 31%|███ | 314/1024 [14:17:10<33:05:18, 167.77s/it]INFO 12-02 03:42:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:42:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:42:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:42:09 [block_pool.py:292] 
Successfully reset prefix cache + + 31%|███ | 315/1024 [14:19:49<32:32:32, 165.24s/it] + {'loss': -0.0072, 'grad_norm': 0.004167635925114155, 'learning_rate': 1e-05, 'num_tokens': 274146482.0, 'completions/mean_length': 6770.46875, 'completions/min_length': 957.0, 'completions/max_length': 15786.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6770.46875, 'completions/min_terminated_length': 957.0, 'completions/max_terminated_length': 15786.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.23486016690731049, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019684650003910065, 'sampling/sampling_logp_difference/max': 9.18593978881836, 'sampling/importance_sampling_ratio/min': 0.00010247006866848096, 'sampling/importance_sampling_ratio/mean': 1.000013828277588, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8252957463264465, 'clip_ratio/low_mean': 1.7575501146893657e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.664363972206047e-06, 'clip_ratio/high_max': 3.0103737344688852e-05, 'clip_ratio/region_mean': 2.723986426644842e-05, 'epoch': 0.29} + + 31%|███ | 315/1024 [14:19:49<32:32:32, 165.24s/it]INFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:44:49 [block_pool.py:292] Successfully reset prefix cache + + 31%|███ | 316/1024 [14:22:58<33:52:58, 172.29s/it] + {'loss': 0.0531, 'grad_norm': 0.0030363225378096104, 'learning_rate': 1e-05, 'num_tokens': 275214040.0, 'completions/mean_length': 8210.859375, 'completions/min_length': 891.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7365.36181640625, 'completions/min_terminated_length': 891.0, 
'completions/max_terminated_length': 15827.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019275270402431488, 'sampling/sampling_logp_difference/max': 5.858705997467041, 'sampling/importance_sampling_ratio/min': 0.002854935359209776, 'sampling/importance_sampling_ratio/mean': 0.9998943209648132, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8118235394358635, 'clip_ratio/low_mean': 3.877351048231503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6323651834682096e-06, 'clip_ratio/high_max': 6.529460733872838e-06, 'clip_ratio/region_mean': 4.040587566578324e-05, 'epoch': 0.29} + + 31%|███ | 316/1024 [14:22:58<33:52:58, 172.29s/it]INFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:47:57 [block_pool.py:292] Successfully reset prefix cache + + 31%|███ | 317/1024 [14:26:01<34:27:13, 175.44s/it] + {'loss': 0.0404, 'grad_norm': 0.004777858033776283, 'learning_rate': 1e-05, 'num_tokens': 276138049.0, 'completions/mean_length': 7072.8828125, 'completions/min_length': 374.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6849.41650390625, 'completions/min_terminated_length': 374.0, 'completions/max_terminated_length': 14900.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01849908009171486, 'sampling/sampling_logp_difference/max': 5.860339164733887, 'sampling/importance_sampling_ratio/min': 0.0028502768836915493, 'sampling/importance_sampling_ratio/mean': 
0.9999368190765381, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8018335327506065, 'clip_ratio/low_mean': 2.3981688286767167e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7700157286526519e-06, 'clip_ratio/high_max': 7.0800629146106075e-06, 'clip_ratio/region_mean': 2.5751703674359305e-05, 'epoch': 0.29} + + 31%|███ | 317/1024 [14:26:01<34:27:13, 175.44s/it]INFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:51:00 [block_pool.py:292] Successfully reset prefix cache + + 31%|███ | 318/1024 [14:28:37<33:16:43, 169.69s/it] + {'loss': 0.1331, 'grad_norm': 0.0030593445990234613, 'learning_rate': 1e-05, 'num_tokens': 276910124.0, 'completions/mean_length': 5889.8359375, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5723.26220703125, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 14447.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.3621976971626282, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01834402233362198, 'sampling/sampling_logp_difference/max': 8.874987602233887, 'sampling/importance_sampling_ratio/min': 0.000139843366923742, 'sampling/importance_sampling_ratio/mean': 0.9999091029167175, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7976400703191757, 'clip_ratio/low_mean': 4.28424866640853e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.648421506521117e-06, 'clip_ratio/high_max': 2.259368602608447e-05, 'clip_ratio/region_mean': 4.849090737479855e-05, 'epoch': 0.29} + + 31%|███ | 318/1024 [14:28:37<33:16:43, 169.69s/it]INFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:53:36 [block_pool.py:292] Successfully reset prefix cache + + 31%|███ | 319/1024 [14:31:38<33:54:05, 173.11s/it] + {'loss': 0.077, 'grad_norm': 0.004245694726705551, 'learning_rate': 1e-05, 'num_tokens': 277843542.0, 'completions/mean_length': 7144.265625, 'completions/min_length': 1200.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6689.85205078125, 'completions/min_terminated_length': 1200.0, 'completions/max_terminated_length': 16324.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.24541422724723816, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01875344291329384, 'sampling/sampling_logp_difference/max': 11.499897956848145, 'sampling/importance_sampling_ratio/min': 1.0131127055501565e-05, 'sampling/importance_sampling_ratio/mean': 0.9998534321784973, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8309404999017715, 'clip_ratio/low_mean': 2.377464920755301e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.663561756184208e-06, 'clip_ratio/high_max': 1.4654247024736833e-05, 'clip_ratio/region_mean': 2.7438210736363544e-05, 'epoch': 0.29} + + 31%|███ | 319/1024 [14:31:38<33:54:05, 173.11s/it]INFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:56:38 [block_pool.py:292] Successfully reset prefix cache + + 31%|███▏ | 320/1024 [14:34:24<33:27:11, 171.07s/it] + {'loss': 0.0723, 'grad_norm': 0.0035574575886130333, 'learning_rate': 1e-05, 'num_tokens': 278730129.0, 'completions/mean_length': 6779.5234375, 'completions/min_length': 767.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6703.8974609375, 'completions/min_terminated_length': 767.0, 'completions/max_terminated_length': 15722.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.32825323939323425, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02137477695941925, 'sampling/sampling_logp_difference/max': 5.151239395141602, 'sampling/importance_sampling_ratio/min': 0.005792221520096064, 'sampling/importance_sampling_ratio/mean': 0.9999299645423889, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9584890529513359, 'clip_ratio/low_mean': 4.735765514851664e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.241558604509919e-06, 'clip_ratio/high_max': 6.252500952541595e-06, 'clip_ratio/region_mean': 4.9599213525652885e-05, 'epoch': 0.29} + + 31%|███▏ | 320/1024 [14:34:24<33:27:11, 171.07s/it]INFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 03:59:27 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 31%|███▏ | 321/1024 [14:36:54<32:11:12, 164.83s/it] + {'loss': 0.0331, 'grad_norm': 0.0037982752546668053, 'learning_rate': 1e-05, 'num_tokens': 279462542.0, 'completions/mean_length': 5582.9765625, 'completions/min_length': 781.0, 'completions/max_length': 15892.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5582.9765625, 'completions/min_terminated_length': 781.0, 'completions/max_terminated_length': 15892.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3164186477661133, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01906203106045723, 'sampling/sampling_logp_difference/max': 6.124997138977051, 'sampling/importance_sampling_ratio/min': 0.0021874974481761456, 'sampling/importance_sampling_ratio/mean': 0.9999780058860779, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8629376217722893, 'clip_ratio/low_mean': 2.195712454522436e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.518853403278627e-06, 'clip_ratio/high_max': 3.2948471016425174e-05, 'clip_ratio/region_mean': 3.14759782895635e-05, 'epoch': 0.3} + + 31%|███▏ | 321/1024 [14:36:54<32:11:12, 164.83s/it]INFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:01:54 [block_pool.py:292] Successfully reset prefix cache + + 31%|███▏ | 322/1024 [14:39:55<33:02:52, 169.48s/it] + {'loss': 0.0585, 'grad_norm': 0.0027678858023136854, 'learning_rate': 1e-05, 'num_tokens': 280370207.0, 'completions/mean_length': 6942.2578125, 'completions/min_length': 1156.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6477.90966796875, 'completions/min_terminated_length': 1156.0, 
'completions/max_terminated_length': 16204.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.3066929280757904, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01940828748047352, 'sampling/sampling_logp_difference/max': 8.3748779296875, 'sampling/importance_sampling_ratio/min': 0.00023058800434228033, 'sampling/importance_sampling_ratio/mean': 0.9998471736907959, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8147861957550049, 'clip_ratio/low_mean': 5.367386921761863e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.757368406491878e-06, 'clip_ratio/high_max': 1.1029473625967512e-05, 'clip_ratio/region_mean': 5.6431237737797346e-05, 'epoch': 0.3} + + 31%|███▏ | 322/1024 [14:39:55<33:02:52, 169.48s/it]INFO 12-02 04:04:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:04:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:04:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:04:55 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 323/1024 [14:42:30<32:10:12, 165.21s/it] + {'loss': 0.0839, 'grad_norm': 0.00577945914119482, 'learning_rate': 1e-05, 'num_tokens': 281189491.0, 'completions/mean_length': 6242.53125, 'completions/min_length': 1220.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5915.38671875, 'completions/min_terminated_length': 1220.0, 'completions/max_terminated_length': 15782.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2398776262998581, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01948760263621807, 'sampling/sampling_logp_difference/max': 9.2499418258667, 'sampling/importance_sampling_ratio/min': 9.611724817659706e-05, 'sampling/importance_sampling_ratio/mean': 
0.9999679327011108, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.878915011882782, 'clip_ratio/low_mean': 3.232976985145797e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.765707434577052e-06, 'clip_ratio/high_max': 2.6367894406575942e-05, 'clip_ratio/region_mean': 4.109547796815605e-05, 'epoch': 0.3} + + 32%|███▏ | 323/1024 [14:42:30<32:10:12, 165.21s/it]INFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:07:30 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 324/1024 [14:45:38<33:25:44, 171.92s/it] + {'loss': 0.0648, 'grad_norm': 0.0014128695474937558, 'learning_rate': 1e-05, 'num_tokens': 282103997.0, 'completions/mean_length': 7004.015625, 'completions/min_length': 224.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6622.71533203125, 'completions/min_terminated_length': 224.0, 'completions/max_terminated_length': 16310.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.21778053045272827, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019019678235054016, 'sampling/sampling_logp_difference/max': 6.011474609375, 'sampling/importance_sampling_ratio/min': 0.0024504722096025944, 'sampling/importance_sampling_ratio/mean': 0.9999747276306152, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7964659407734871, 'clip_ratio/low_mean': 1.833109013205103e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1664920634757436e-05, 'clip_ratio/high_max': 3.50839609382092e-05, 'clip_ratio/region_mean': 2.9996010880495305e-05, 'epoch': 0.3} + + 32%|███▏ | 324/1024 [14:45:38<33:25:44, 171.92s/it]INFO 12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache +INFO 
12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:10:37 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 325/1024 [14:48:25<33:06:48, 170.54s/it] + {'loss': 0.0515, 'grad_norm': 0.002476039342582226, 'learning_rate': 1e-05, 'num_tokens': 283122382.0, 'completions/mean_length': 7822.6953125, 'completions/min_length': 575.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7546.52392578125, 'completions/min_terminated_length': 575.0, 'completions/max_terminated_length': 15318.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020557202398777008, 'sampling/sampling_logp_difference/max': 6.930576324462891, 'sampling/importance_sampling_ratio/min': 0.0009774373611435294, 'sampling/importance_sampling_ratio/mean': 0.9999314546585083, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8571138679981232, 'clip_ratio/low_mean': 5.309064226821647e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.580651363994548e-06, 'clip_ratio/high_max': 1.832260545597819e-05, 'clip_ratio/region_mean': 5.767129368905444e-05, 'epoch': 0.3} + + 32%|███▏ | 325/1024 [14:48:25<33:06:48, 170.54s/it]INFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:13:25 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 326/1024 [14:51:27<33:44:26, 174.02s/it] + {'loss': 0.043, 'grad_norm': 0.005309853237122297, 'learning_rate': 1e-05, 'num_tokens': 284130081.0, 'completions/mean_length': 7738.8984375, 'completions/min_length': 897.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 6844.57763671875, 'completions/min_terminated_length': 897.0, 'completions/max_terminated_length': 16319.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01873316988348961, 'sampling/sampling_logp_difference/max': 8.933455467224121, 'sampling/importance_sampling_ratio/min': 0.0001319014554610476, 'sampling/importance_sampling_ratio/mean': 0.9998971223831177, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7839021533727646, 'clip_ratio/low_mean': 4.19679121819172e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4346049965752172e-06, 'clip_ratio/high_max': 5.738419986300869e-06, 'clip_ratio/region_mean': 4.3402517292179255e-05, 'epoch': 0.3} + + 32%|███▏ | 326/1024 [14:51:27<33:44:26, 174.02s/it]INFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:16:27 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 327/1024 [14:54:13<33:11:44, 171.46s/it] + {'loss': 0.0574, 'grad_norm': 0.004228116944432259, 'learning_rate': 1e-05, 'num_tokens': 285058720.0, 'completions/mean_length': 7102.2421875, 'completions/min_length': 529.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6954.9130859375, 'completions/min_terminated_length': 529.0, 'completions/max_terminated_length': 15952.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 
0.019325006753206253, 'sampling/sampling_logp_difference/max': 8.951294898986816, 'sampling/importance_sampling_ratio/min': 0.00012956927821505815, 'sampling/importance_sampling_ratio/mean': 0.9999712705612183, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8530801385641098, 'clip_ratio/low_mean': 4.043528815600439e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5177145062116324e-06, 'clip_ratio/high_max': 1.007085802484653e-05, 'clip_ratio/region_mean': 4.295300277590286e-05, 'epoch': 0.3} + + 32%|███▏ | 327/1024 [14:54:13<33:11:44, 171.46s/it]INFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:19:12 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 328/1024 [14:56:54<32:32:24, 168.31s/it] + {'loss': 0.0374, 'grad_norm': 0.004967439454048872, 'learning_rate': 1e-05, 'num_tokens': 285919765.0, 'completions/mean_length': 6583.4765625, 'completions/min_length': 718.0, 'completions/max_length': 15594.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6583.4765625, 'completions/min_terminated_length': 718.0, 'completions/max_terminated_length': 15594.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021393200382590294, 'sampling/sampling_logp_difference/max': 4.093823432922363, 'sampling/importance_sampling_ratio/min': 0.016675354912877083, 'sampling/importance_sampling_ratio/mean': 1.00004243850708, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.021921381354332, 'clip_ratio/low_mean': 3.661125703047219e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0171863777941326e-06, 'clip_ratio/high_max': 4.06874551117653e-06, 
'clip_ratio/region_mean': 3.762844340826632e-05, 'epoch': 0.3} + + 32%|███▏ | 328/1024 [14:56:54<32:32:24, 168.31s/it]INFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:21:53 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 329/1024 [15:00:04<33:46:42, 174.97s/it] + {'loss': 0.0805, 'grad_norm': 0.004189736675471067, 'learning_rate': 1e-05, 'num_tokens': 286935512.0, 'completions/mean_length': 7770.5859375, 'completions/min_length': 1040.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7346.97509765625, 'completions/min_terminated_length': 1040.0, 'completions/max_terminated_length': 16299.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2369818240404129, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021805983036756516, 'sampling/sampling_logp_difference/max': 4.449572563171387, 'sampling/importance_sampling_ratio/min': 0.011683559976518154, 'sampling/importance_sampling_ratio/mean': 0.9999797344207764, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0466903448104858, 'clip_ratio/low_mean': 4.05305947879242e-05, 'clip_ratio/low_min': 4.215567059873138e-06, 'clip_ratio/high_mean': 3.053812861253391e-06, 'clip_ratio/high_max': 1.2215251445013564e-05, 'clip_ratio/region_mean': 4.358440742180392e-05, 'epoch': 0.3} + + 32%|███▏ | 329/1024 [15:00:04<33:46:42, 174.97s/it]INFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:25:04 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 330/1024 
[15:02:30<32:04:21, 166.37s/it] + {'loss': 0.0635, 'grad_norm': 0.0032866497058421373, 'learning_rate': 1e-05, 'num_tokens': 287681943.0, 'completions/mean_length': 5689.2421875, 'completions/min_length': 1194.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5432.568359375, 'completions/min_terminated_length': 1194.0, 'completions/max_terminated_length': 15758.0, 'rewards/accuracy_reward/mean': 0.640625, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.640625, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01789461076259613, 'sampling/sampling_logp_difference/max': 7.873311519622803, 'sampling/importance_sampling_ratio/min': 0.00038077132194302976, 'sampling/importance_sampling_ratio/mean': 0.999940812587738, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7778806164860725, 'clip_ratio/low_mean': 1.8177100628236076e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.141844553691044e-06, 'clip_ratio/high_max': 2.0567378214764176e-05, 'clip_ratio/region_mean': 2.3318944840866607e-05, 'epoch': 0.3} + + 32%|███▏ | 330/1024 [15:02:30<32:04:21, 166.37s/it]INFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:27:30 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 331/1024 [15:04:52<30:36:47, 159.03s/it] + {'loss': 0.0746, 'grad_norm': 0.0023572889622300863, 'learning_rate': 1e-05, 'num_tokens': 288506735.0, 'completions/mean_length': 6288.1875, 'completions/min_length': 751.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6127.93701171875, 'completions/min_terminated_length': 751.0, 'completions/max_terminated_length': 13820.0, 
'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3066929280757904, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017407266423106194, 'sampling/sampling_logp_difference/max': 7.749598503112793, 'sampling/importance_sampling_ratio/min': 0.000430915504693985, 'sampling/importance_sampling_ratio/mean': 0.9999474287033081, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7709921672940254, 'clip_ratio/low_mean': 3.1423560130861006e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.772717253828887e-06, 'clip_ratio/high_max': 3.109086901531555e-05, 'clip_ratio/region_mean': 3.919627738468989e-05, 'epoch': 0.3} + + 32%|███▏ | 331/1024 [15:04:52<30:36:47, 159.03s/it]INFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:29:52 [block_pool.py:292] Successfully reset prefix cache + + 32%|███▏ | 332/1024 [15:07:39<31:01:48, 161.43s/it] + {'loss': 0.0986, 'grad_norm': 0.0034220058005303144, 'learning_rate': 1e-05, 'num_tokens': 289395498.0, 'completions/mean_length': 6775.0234375, 'completions/min_length': 655.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6465.05615234375, 'completions/min_terminated_length': 655.0, 'completions/max_terminated_length': 16318.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.34533774852752686, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019930530339479446, 'sampling/sampling_logp_difference/max': 3.449552536010742, 'sampling/importance_sampling_ratio/min': 0.0317598432302475, 'sampling/importance_sampling_ratio/mean': 0.9999603033065796, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9338318258523941, 'clip_ratio/low_mean': 6.26047980176736e-05, 'clip_ratio/low_min': 5.51267930859467e-06, 'clip_ratio/high_mean': 9.51674803673086e-06, 'clip_ratio/high_max': 3.4638953366084024e-05, 'clip_ratio/region_mean': 7.212154741864651e-05, 'epoch': 0.31} + + 32%|███▏ | 332/1024 [15:07:39<31:01:48, 161.43s/it]INFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:32:39 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 333/1024 [15:10:18<30:48:09, 160.48s/it] + {'loss': 0.0262, 'grad_norm': 0.002513247774913907, 'learning_rate': 1e-05, 'num_tokens': 290329082.0, 'completions/mean_length': 7142.9375, 'completions/min_length': 707.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6844.83837890625, 'completions/min_terminated_length': 707.0, 'completions/max_terminated_length': 15295.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022366533055901527, 'sampling/sampling_logp_difference/max': 14.969992637634277, 'sampling/importance_sampling_ratio/min': 3.152207455059397e-07, 'sampling/importance_sampling_ratio/mean': 0.9999737739562988, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.971405878663063, 'clip_ratio/low_mean': 7.159989991123439e-05, 'clip_ratio/low_min': 1.5592839645250933e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 7.159989991123439e-05, 'epoch': 0.31} + + 33%|███▎ | 333/1024 [15:10:18<30:48:09, 160.48s/it]INFO 12-02 04:35:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:35:17 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:35:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:35:17 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 334/1024 [15:13:11<31:29:07, 164.27s/it] + {'loss': 0.0204, 'grad_norm': 0.0056767817586660385, 'learning_rate': 1e-05, 'num_tokens': 291170133.0, 'completions/mean_length': 6412.2109375, 'completions/min_length': 544.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6333.69287109375, 'completions/min_terminated_length': 544.0, 'completions/max_terminated_length': 15581.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.15650184452533722, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.020012658089399338, 'sampling/sampling_logp_difference/max': 7.687117099761963, 'sampling/importance_sampling_ratio/min': 0.000458698661532253, 'sampling/importance_sampling_ratio/mean': 0.9999720454216003, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9136044681072235, 'clip_ratio/low_mean': 1.7493430505055585e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.126938051740581e-06, 'clip_ratio/high_max': 1.6507752206962323e-05, 'clip_ratio/region_mean': 2.1620368215735652e-05, 'epoch': 0.31} + + 33%|███▎ | 334/1024 [15:13:11<31:29:07, 164.27s/it]INFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:38:10 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 335/1024 [15:16:10<32:20:01, 168.94s/it] + {'loss': 0.0432, 'grad_norm': 0.00243841833434999, 'learning_rate': 1e-05, 'num_tokens': 292222082.0, 'completions/mean_length': 8066.1015625, 'completions/min_length': 497.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7797.7822265625, 'completions/min_terminated_length': 497.0, 'completions/max_terminated_length': 16111.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2688046097755432, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.023650091141462326, 'sampling/sampling_logp_difference/max': 9.374991416931152, 'sampling/importance_sampling_ratio/min': 8.481895929435268e-05, 'sampling/importance_sampling_ratio/mean': 0.9999664425849915, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0789504647254944, 'clip_ratio/low_mean': 3.6938338666914206e-05, 'clip_ratio/low_min': 5.699044777429663e-06, 'clip_ratio/high_mean': 2.0652136072385474e-06, 'clip_ratio/high_max': 8.26085442895419e-06, 'clip_ratio/region_mean': 3.900355193309224e-05, 'epoch': 0.31} + + 33%|███▎ | 335/1024 [15:16:10<32:20:01, 168.94s/it]INFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:41:10 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 336/1024 [15:18:50<31:43:31, 166.00s/it] + {'loss': 0.0372, 'grad_norm': 0.0020856577903032303, 'learning_rate': 1e-05, 'num_tokens': 293115984.0, 'completions/mean_length': 6836.046875, 'completions/min_length': 785.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6606.896484375, 'completions/min_terminated_length': 785.0, 'completions/max_terminated_length': 15176.0, 'rewards/accuracy_reward/mean': 0.21875, 'rewards/accuracy_reward/std': 0.41502299904823303, 'reward': 0.21875, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 
0.022012067958712578, 'sampling/sampling_logp_difference/max': 10.488847732543945, 'sampling/importance_sampling_ratio/min': 2.784526441246271e-05, 'sampling/importance_sampling_ratio/mean': 0.9999911785125732, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.218759760260582, 'clip_ratio/low_mean': 1.9117383317279746e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.330013674305519e-06, 'clip_ratio/high_max': 5.320054697222076e-06, 'clip_ratio/region_mean': 2.0447396991585265e-05, 'epoch': 0.31} + + 33%|███▎ | 336/1024 [15:18:50<31:43:31, 166.00s/it]INFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:43:49 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 337/1024 [15:21:57<32:52:51, 172.30s/it] + {'loss': 0.0354, 'grad_norm': 0.005163854919373989, 'learning_rate': 1e-05, 'num_tokens': 294099503.0, 'completions/mean_length': 7501.9921875, 'completions/min_length': 1237.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7140.9345703125, 'completions/min_terminated_length': 1237.0, 'completions/max_terminated_length': 15796.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.30904707312583923, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020813245326280594, 'sampling/sampling_logp_difference/max': 7.331547260284424, 'sampling/importance_sampling_ratio/min': 0.0006545600481331348, 'sampling/importance_sampling_ratio/mean': 0.9999276399612427, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8940394818782806, 'clip_ratio/low_mean': 4.6741323160404136e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.549717793153832e-06, 'clip_ratio/high_max': 
2.5695502699818462e-05, 'clip_ratio/region_mean': 5.429104089671455e-05, 'epoch': 0.31} + + 33%|███▎ | 337/1024 [15:21:57<32:52:51, 172.30s/it]INFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:46:56 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 338/1024 [15:24:54<33:07:29, 173.83s/it] + {'loss': 0.0963, 'grad_norm': 0.0029277894645929337, 'learning_rate': 1e-05, 'num_tokens': 295042105.0, 'completions/mean_length': 7204.828125, 'completions/min_length': 846.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6908.7255859375, 'completions/min_terminated_length': 846.0, 'completions/max_terminated_length': 16034.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.28801077604293823, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020187582820653915, 'sampling/sampling_logp_difference/max': 10.872637748718262, 'sampling/importance_sampling_ratio/min': 1.8970265955431387e-05, 'sampling/importance_sampling_ratio/mean': 1.0000677108764648, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9961872175335884, 'clip_ratio/low_mean': 4.5567895540443715e-05, 'clip_ratio/low_min': 4.458871444512624e-06, 'clip_ratio/high_mean': 9.45794374729303e-06, 'clip_ratio/high_max': 3.1606674838258186e-05, 'clip_ratio/region_mean': 5.502583962879726e-05, 'epoch': 0.31} + + 33%|███▎ | 338/1024 [15:24:54<33:07:29, 173.83s/it]INFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:49:54 [block_pool.py:292] Successfully reset prefix cache + + 
33%|███▎ | 339/1024 [15:27:38<32:30:54, 170.88s/it] + {'loss': 0.0473, 'grad_norm': 0.0032952844630926847, 'learning_rate': 1e-05, 'num_tokens': 295867039.0, 'completions/mean_length': 6256.859375, 'completions/min_length': 1006.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6013.80810546875, 'completions/min_terminated_length': 1006.0, 'completions/max_terminated_length': 15856.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.24670752882957458, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019491540268063545, 'sampling/sampling_logp_difference/max': 9.434039115905762, 'sampling/importance_sampling_ratio/min': 7.995560008566827e-05, 'sampling/importance_sampling_ratio/mean': 0.9999649524688721, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9293600022792816, 'clip_ratio/low_mean': 1.8380221035840805e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.194059781388205e-06, 'clip_ratio/high_max': 1.7963964182854397e-05, 'clip_ratio/region_mean': 2.357428081722901e-05, 'epoch': 0.31} + + 33%|███▎ | 339/1024 [15:27:38<32:30:54, 170.88s/it]INFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:52:38 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 340/1024 [15:30:30<32:33:21, 171.35s/it] + {'loss': 0.0621, 'grad_norm': 0.0029417150653898716, 'learning_rate': 1e-05, 'num_tokens': 296832843.0, 'completions/mean_length': 7397.84375, 'completions/min_length': 923.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7032.552734375, 'completions/min_terminated_length': 923.0, 'completions/max_terminated_length': 15412.0, 
'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2867125868797302, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01975393109023571, 'sampling/sampling_logp_difference/max': 10.93724250793457, 'sampling/importance_sampling_ratio/min': 1.7783446310204454e-05, 'sampling/importance_sampling_ratio/mean': 1.0000183582305908, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8508890569210052, 'clip_ratio/low_mean': 2.7479814093567256e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.8943877648780472e-06, 'clip_ratio/high_max': 7.577551059512189e-06, 'clip_ratio/region_mean': 2.9374201631071628e-05, 'epoch': 0.31} + + 33%|███▎ | 340/1024 [15:30:30<32:33:21, 171.35s/it]INFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:55:30 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 341/1024 [15:33:07<31:39:28, 166.86s/it] + {'loss': 0.0664, 'grad_norm': 0.0026788609102368355, 'learning_rate': 1e-05, 'num_tokens': 297735285.0, 'completions/mean_length': 6897.765625, 'completions/min_length': 371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6823.07080078125, 'completions/min_terminated_length': 371.0, 'completions/max_terminated_length': 14983.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3266732692718506, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020578179508447647, 'sampling/sampling_logp_difference/max': 6.370794296264648, 'sampling/importance_sampling_ratio/min': 0.001710799871943891, 'sampling/importance_sampling_ratio/mean': 0.999909520149231, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9046694040298462, 'clip_ratio/low_mean': 5.109179869577929e-05, 'clip_ratio/low_min': 6.657612175331451e-06, 'clip_ratio/high_mean': 1.3302957199812226e-05, 'clip_ratio/high_max': 3.281225508544594e-05, 'clip_ratio/region_mean': 6.439475532715733e-05, 'epoch': 0.31} + + 33%|███▎ | 341/1024 [15:33:07<31:39:28, 166.86s/it]INFO 12-02 04:58:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:58:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:58:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 04:58:07 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 342/1024 [15:36:11<32:34:06, 171.92s/it] + {'loss': 0.0923, 'grad_norm': 0.005915141198784113, 'learning_rate': 1e-05, 'num_tokens': 298645124.0, 'completions/mean_length': 6971.9921875, 'completions/min_length': 6.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6509.10595703125, 'completions/min_terminated_length': 6.0, 'completions/max_terminated_length': 15525.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3742823898792267, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01906151883304119, 'sampling/sampling_logp_difference/max': 6.937347412109375, 'sampling/importance_sampling_ratio/min': 0.000970841443631798, 'sampling/importance_sampling_ratio/mean': 0.9999268651008606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8658201694488525, 'clip_ratio/low_mean': 7.019768918326008e-05, 'clip_ratio/low_min': 2.541147478041239e-05, 'clip_ratio/high_mean': 5.168538336874917e-06, 'clip_ratio/high_max': 1.7319889593636617e-05, 'clip_ratio/region_mean': 7.53662266106403e-05, 'epoch': 0.31} + + 33%|███▎ | 342/1024 [15:36:11<32:34:06, 171.92s/it]INFO 12-02 05:01:10 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-02 05:01:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:01:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:01:10 [block_pool.py:292] Successfully reset prefix cache + + 33%|███▎ | 343/1024 [15:38:55<32:06:03, 169.70s/it] + {'loss': 0.097, 'grad_norm': 0.0032792428974062204, 'learning_rate': 1e-05, 'num_tokens': 299503781.0, 'completions/mean_length': 6545.6953125, 'completions/min_length': 800.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5889.80859375, 'completions/min_terminated_length': 800.0, 'completions/max_terminated_length': 15054.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.38293448090553284, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017413027584552765, 'sampling/sampling_logp_difference/max': 6.124998092651367, 'sampling/importance_sampling_ratio/min': 0.002187495119869709, 'sampling/importance_sampling_ratio/mean': 0.9999361634254456, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.779609851539135, 'clip_ratio/low_mean': 6.167940273371642e-05, 'clip_ratio/low_min': 5.969151516183047e-06, 'clip_ratio/high_mean': 4.583216309583804e-06, 'clip_ratio/high_max': 1.8332865238335216e-05, 'clip_ratio/region_mean': 6.626261847486603e-05, 'epoch': 0.32} + + 33%|███▎ | 343/1024 [15:38:55<32:06:03, 169.70s/it]INFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:03:55 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▎ | 344/1024 [15:41:50<32:20:24, 171.21s/it] + {'loss': 0.1102, 'grad_norm': 0.005092279519885778, 'learning_rate': 1e-05, 'num_tokens': 300447903.0, 'completions/mean_length': 7226.515625, 
'completions/min_length': 454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7006.736328125, 'completions/min_terminated_length': 454.0, 'completions/max_terminated_length': 15318.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2998581528663635, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021570835262537003, 'sampling/sampling_logp_difference/max': 7.374476909637451, 'sampling/importance_sampling_ratio/min': 0.000627054600045085, 'sampling/importance_sampling_ratio/mean': 0.9999373555183411, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9573849961161613, 'clip_ratio/low_mean': 4.46246323235755e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.46246323235755e-05, 'epoch': 0.32} + + 34%|███▎ | 344/1024 [15:41:50<32:20:24, 171.21s/it]INFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:06:49 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▎ | 345/1024 [15:44:30<31:41:16, 168.01s/it] + {'loss': 0.0655, 'grad_norm': 0.005033228080719709, 'learning_rate': 1e-05, 'num_tokens': 301206021.0, 'completions/mean_length': 5755.171875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5323.10546875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 14967.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3424547016620636, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018870476633310318, 
'sampling/sampling_logp_difference/max': 6.531146049499512, 'sampling/importance_sampling_ratio/min': 0.0014573346124961972, 'sampling/importance_sampling_ratio/mean': 0.9999947547912598, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8482184633612633, 'clip_ratio/low_mean': 4.7280102080549113e-05, 'clip_ratio/low_min': 1.0166083029616857e-05, 'clip_ratio/high_mean': 1.3718173477172968e-06, 'clip_ratio/high_max': 5.487269390869187e-06, 'clip_ratio/region_mean': 4.865191931457957e-05, 'epoch': 0.32} + + 34%|███▎ | 345/1024 [15:44:30<31:41:16, 168.01s/it]INFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:09:30 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 346/1024 [15:47:19<31:41:58, 168.32s/it] + {'loss': 0.0707, 'grad_norm': 0.007659573573619127, 'learning_rate': 1e-05, 'num_tokens': 302133890.0, 'completions/mean_length': 7098.7265625, 'completions/min_length': 947.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6875.88037109375, 'completions/min_terminated_length': 947.0, 'completions/max_terminated_length': 15509.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.23410367965698242, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019994346424937248, 'sampling/sampling_logp_difference/max': 6.687288761138916, 'sampling/importance_sampling_ratio/min': 0.0012466582702472806, 'sampling/importance_sampling_ratio/mean': 1.0000004768371582, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.87320177257061, 'clip_ratio/low_mean': 1.6510994441887306e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3553367352869827e-06, 'clip_ratio/high_max': 5.421346941147931e-06, 
'clip_ratio/region_mean': 1.786633117717429e-05, 'epoch': 0.32} + + 34%|███▍ | 346/1024 [15:47:19<31:41:58, 168.32s/it]INFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:12:19 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 347/1024 [15:50:05<31:29:14, 167.44s/it] + {'loss': 0.0487, 'grad_norm': 0.0014135175151750445, 'learning_rate': 1e-05, 'num_tokens': 302972566.0, 'completions/mean_length': 6399.96875, 'completions/min_length': 364.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6077.90283203125, 'completions/min_terminated_length': 364.0, 'completions/max_terminated_length': 16139.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02073008380830288, 'sampling/sampling_logp_difference/max': 5.963917255401611, 'sampling/importance_sampling_ratio/min': 0.0025698256213217974, 'sampling/importance_sampling_ratio/mean': 0.9999452829360962, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9481896534562111, 'clip_ratio/low_mean': 3.8744643916288624e-05, 'clip_ratio/low_min': 6.108287834649673e-06, 'clip_ratio/high_mean': 2.8890573275930365e-06, 'clip_ratio/high_max': 1.1556229310372146e-05, 'clip_ratio/region_mean': 4.1633702039689524e-05, 'epoch': 0.32} + + 34%|███▍ | 347/1024 [15:50:05<31:29:14, 167.44s/it]INFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:15:04 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 348/1024 
[15:52:57<31:41:34, 168.78s/it] + {'loss': 0.0126, 'grad_norm': 0.0027898226398974657, 'learning_rate': 1e-05, 'num_tokens': 303925976.0, 'completions/mean_length': 7298.078125, 'completions/min_length': 1009.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7226.53564453125, 'completions/min_terminated_length': 1009.0, 'completions/max_terminated_length': 16095.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020944103598594666, 'sampling/sampling_logp_difference/max': 5.252114772796631, 'sampling/importance_sampling_ratio/min': 0.005236432887613773, 'sampling/importance_sampling_ratio/mean': 0.9999772310256958, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8719206526875496, 'clip_ratio/low_mean': 4.620846755187813e-05, 'clip_ratio/low_min': 6.243132702365983e-06, 'clip_ratio/high_mean': 2.545892130001448e-06, 'clip_ratio/high_max': 6.59491388432798e-06, 'clip_ratio/region_mean': 4.875435956819274e-05, 'epoch': 0.32} + + 34%|███▍ | 348/1024 [15:52:57<31:41:34, 168.78s/it]INFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:17:56 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 349/1024 [15:55:24<30:25:36, 162.28s/it] + {'loss': 0.0248, 'grad_norm': 0.0012764945859089494, 'learning_rate': 1e-05, 'num_tokens': 304675157.0, 'completions/mean_length': 5667.0390625, 'completions/min_length': 974.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5496.9287109375, 'completions/min_terminated_length': 974.0, 'completions/max_terminated_length': 14980.0, 
'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.17965976893901825, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018928447738289833, 'sampling/sampling_logp_difference/max': 12.195245742797852, 'sampling/importance_sampling_ratio/min': 5.054428584116977e-06, 'sampling/importance_sampling_ratio/mean': 1.0000383853912354, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8791451379656792, 'clip_ratio/low_mean': 2.010384196182713e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6303108597858227e-06, 'clip_ratio/high_max': 1.052124343914329e-05, 'clip_ratio/region_mean': 2.273415248055244e-05, 'epoch': 0.32} + + 34%|███▍ | 349/1024 [15:55:24<30:25:36, 162.28s/it]INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:20:23 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 350/1024 [15:57:41<28:58:05, 154.73s/it] + {'loss': 0.0221, 'grad_norm': 0.00509974779561162, 'learning_rate': 1e-05, 'num_tokens': 305447038.0, 'completions/mean_length': 5874.4453125, 'completions/min_length': 486.0, 'completions/max_length': 15354.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5874.4453125, 'completions/min_terminated_length': 486.0, 'completions/max_terminated_length': 15354.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.24777325987815857, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02114470861852169, 'sampling/sampling_logp_difference/max': 5.340880870819092, 'sampling/importance_sampling_ratio/min': 0.004791648127138615, 'sampling/importance_sampling_ratio/mean': 0.9999423027038574, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9577538818120956, 'clip_ratio/low_mean': 3.1114799753595435e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3945113980516908e-06, 'clip_ratio/high_max': 9.578045592206763e-06, 'clip_ratio/region_mean': 3.350931149270764e-05, 'epoch': 0.32} + + 34%|███▍ | 350/1024 [15:57:41<28:58:05, 154.73s/it]INFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:22:41 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 351/1024 [16:00:17<29:00:01, 155.13s/it] + {'loss': 0.06, 'grad_norm': 0.0030849494505673647, 'learning_rate': 1e-05, 'num_tokens': 306258023.0, 'completions/mean_length': 6197.5703125, 'completions/min_length': 316.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6035.88134765625, 'completions/min_terminated_length': 316.0, 'completions/max_terminated_length': 15670.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.3748064339160919, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021017421036958694, 'sampling/sampling_logp_difference/max': 7.093727111816406, 'sampling/importance_sampling_ratio/min': 0.000830297009088099, 'sampling/importance_sampling_ratio/mean': 0.9998056888580322, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8665244281291962, 'clip_ratio/low_mean': 4.784364205079328e-05, 'clip_ratio/low_min': 3.861600362142781e-06, 'clip_ratio/high_mean': 2.7257655688117666e-06, 'clip_ratio/high_max': 1.0903062275247066e-05, 'clip_ratio/region_mean': 5.056940744907479e-05, 'epoch': 0.32} + + 34%|███▍ | 351/1024 [16:00:17<29:00:01, 155.13s/it]INFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:25:17 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 352/1024 [16:03:08<29:51:20, 159.94s/it] + {'loss': 0.076, 'grad_norm': 0.002946985885500908, 'learning_rate': 1e-05, 'num_tokens': 307240305.0, 'completions/mean_length': 7522.578125, 'completions/min_length': 794.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7381.9208984375, 'completions/min_terminated_length': 794.0, 'completions/max_terminated_length': 16276.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01965932548046112, 'sampling/sampling_logp_difference/max': 5.273195743560791, 'sampling/importance_sampling_ratio/min': 0.005127199459820986, 'sampling/importance_sampling_ratio/mean': 0.9999547004699707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8185881152749062, 'clip_ratio/low_mean': 6.213493452378316e-05, 'clip_ratio/low_min': 1.0056635801447555e-05, 'clip_ratio/high_mean': 4.3520980170796975e-06, 'clip_ratio/high_max': 1.4299712574938894e-05, 'clip_ratio/region_mean': 6.648703174505499e-05, 'epoch': 0.32} + + 34%|███▍ | 352/1024 [16:03:08<29:51:20, 159.94s/it]INFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:28:08 [block_pool.py:292] Successfully reset prefix cache + + 34%|███▍ | 353/1024 [16:05:41<29:24:17, 157.76s/it] + {'loss': 0.072, 'grad_norm': 0.0031181599479168653, 'learning_rate': 1e-05, 'num_tokens': 308079318.0, 'completions/mean_length': 6403.2265625, 'completions/min_length': 
552.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6163.6884765625, 'completions/min_terminated_length': 552.0, 'completions/max_terminated_length': 14090.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.27145031094551086, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01963040418922901, 'sampling/sampling_logp_difference/max': 9.605287551879883, 'sampling/importance_sampling_ratio/min': 6.73715621815063e-05, 'sampling/importance_sampling_ratio/mean': 0.9999215602874756, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8359840363264084, 'clip_ratio/low_mean': 4.2052345861520735e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.08456730585749e-06, 'clip_ratio/high_max': 1.693051035545068e-05, 'clip_ratio/region_mean': 4.713691282631771e-05, 'epoch': 0.32} + + 34%|███▍ | 353/1024 [16:05:41<29:24:17, 157.76s/it]INFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:30:40 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▍ | 354/1024 [16:08:40<30:32:35, 164.11s/it] + {'loss': 0.1022, 'grad_norm': 0.002656223252415657, 'learning_rate': 1e-05, 'num_tokens': 309117770.0, 'completions/mean_length': 7954.03125, 'completions/min_length': 632.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7751.71240234375, 'completions/min_terminated_length': 632.0, 'completions/max_terminated_length': 16148.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.321655809879303, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020741507411003113, 
'sampling/sampling_logp_difference/max': 7.999940395355225, 'sampling/importance_sampling_ratio/min': 0.0003354826185386628, 'sampling/importance_sampling_ratio/mean': 0.9999536275863647, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.905990719795227, 'clip_ratio/low_mean': 6.722658486069122e-05, 'clip_ratio/low_min': 1.858519090092159e-05, 'clip_ratio/high_mean': 3.497229783988587e-06, 'clip_ratio/high_max': 1.3988919135954347e-05, 'clip_ratio/region_mean': 7.072381458783639e-05, 'epoch': 0.33} + + 35%|███▍ | 354/1024 [16:08:40<30:32:35, 164.11s/it]INFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:33:39 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▍ | 355/1024 [16:11:34<31:03:15, 167.11s/it] + {'loss': 0.0617, 'grad_norm': 0.0060529084876179695, 'learning_rate': 1e-05, 'num_tokens': 309988894.0, 'completions/mean_length': 6630.09375, 'completions/min_length': 375.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6315.45166015625, 'completions/min_terminated_length': 375.0, 'completions/max_terminated_length': 16272.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2790592312812805, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02060208097100258, 'sampling/sampling_logp_difference/max': 10.716434478759766, 'sampling/importance_sampling_ratio/min': 2.2177453502081335e-05, 'sampling/importance_sampling_ratio/mean': 0.9998822212219238, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.870736837387085, 'clip_ratio/low_mean': 4.337988764291367e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.402648755785776e-06, 'clip_ratio/high_max': 1.7610595023143105e-05, 
'clip_ratio/region_mean': 4.778253651238629e-05, 'epoch': 0.33} + + 35%|███▍ | 355/1024 [16:11:34<31:03:15, 167.11s/it]INFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:36:33 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-02 05:38:09,170 - math_verify.grader - WARNING - Timeout during comparison + + 35%|███▍ | 356/1024 [16:14:04<30:05:25, 162.16s/it] + {'loss': 0.0605, 'grad_norm': 0.00400698184967041, 'learning_rate': 1e-05, 'num_tokens': 310864013.0, 'completions/mean_length': 6679.6171875, 'completions/min_length': 611.0, 'completions/max_length': 15920.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6679.6171875, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 15920.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3295465111732483, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02098071575164795, 'sampling/sampling_logp_difference/max': 6.1853485107421875, 'sampling/importance_sampling_ratio/min': 0.0020593837834894657, 'sampling/importance_sampling_ratio/mean': 0.9999049305915833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9812518879771233, 'clip_ratio/low_mean': 3.1030769946482906e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6120233087567613e-06, 'clip_ratio/high_max': 1.0448093235027045e-05, 'clip_ratio/region_mean': 3.364279325523967e-05, 'epoch': 0.33} + + 35%|███▍ | 356/1024 [16:14:04<30:05:25, 162.16s/it]INFO 12-02 05:39:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:39:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:39:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:39:04 
[block_pool.py:292] Successfully reset prefix cache + + 35%|███▍ | 357/1024 [16:16:39<29:36:07, 159.77s/it] + {'loss': 0.0362, 'grad_norm': 0.005929585546255112, 'learning_rate': 1e-05, 'num_tokens': 311589987.0, 'completions/mean_length': 5523.796875, 'completions/min_length': 633.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5173.4677734375, 'completions/min_terminated_length': 633.0, 'completions/max_terminated_length': 14541.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019948206841945648, 'sampling/sampling_logp_difference/max': 6.843744277954102, 'sampling/importance_sampling_ratio/min': 0.0010661041596904397, 'sampling/importance_sampling_ratio/mean': 0.9998446702957153, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9120645374059677, 'clip_ratio/low_mean': 2.900951585615985e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.736592579021817e-06, 'clip_ratio/high_max': 2.124982574969181e-05, 'clip_ratio/region_mean': 3.674610888992902e-05, 'epoch': 0.33} + + 35%|███▍ | 357/1024 [16:16:39<29:36:07, 159.77s/it]INFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:41:38 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▍ | 358/1024 [16:19:07<28:54:07, 156.23s/it] + {'loss': 0.1023, 'grad_norm': 0.006622390355914831, 'learning_rate': 1e-05, 'num_tokens': 312424034.0, 'completions/mean_length': 6361.3671875, 'completions/min_length': 432.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6282.44873046875, 'completions/min_terminated_length': 432.0, 
'completions/max_terminated_length': 15401.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3724474310874939, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018907658755779266, 'sampling/sampling_logp_difference/max': 8.060688972473145, 'sampling/importance_sampling_ratio/min': 0.0003157092141918838, 'sampling/importance_sampling_ratio/mean': 1.0000219345092773, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8044678047299385, 'clip_ratio/low_mean': 5.346400575945154e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.219769085826556e-06, 'clip_ratio/high_max': 2.4486997745043482e-05, 'clip_ratio/region_mean': 6.168377467474784e-05, 'epoch': 0.33} + + 35%|███▍ | 358/1024 [16:19:07<28:54:07, 156.23s/it]INFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:44:06 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▌ | 359/1024 [16:21:47<29:03:51, 157.34s/it] + {'loss': 0.0861, 'grad_norm': 0.004639944992959499, 'learning_rate': 1e-05, 'num_tokens': 313353346.0, 'completions/mean_length': 7109.0, 'completions/min_length': 611.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7035.96826171875, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 15883.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3826971650123596, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.02155841514468193, 'sampling/sampling_logp_difference/max': 6.262202262878418, 'sampling/importance_sampling_ratio/min': 0.0019070414127781987, 'sampling/importance_sampling_ratio/mean': 
0.9999389052391052, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9167275875806808, 'clip_ratio/low_mean': 5.925514369664597e-05, 'clip_ratio/low_min': 1.3324347946763737e-05, 'clip_ratio/high_mean': 2.6018441872110998e-06, 'clip_ratio/high_max': 1.0407376748844399e-05, 'clip_ratio/region_mean': 6.185698703120579e-05, 'epoch': 0.33} + + 35%|███▌ | 359/1024 [16:21:47<29:03:51, 157.34s/it]INFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:46:46 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▌ | 360/1024 [16:24:35<29:37:36, 160.63s/it] + {'loss': 0.0873, 'grad_norm': 0.007643720600754023, 'learning_rate': 1e-05, 'num_tokens': 314180717.0, 'completions/mean_length': 6314.2734375, 'completions/min_length': 665.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6072.60009765625, 'completions/min_terminated_length': 665.0, 'completions/max_terminated_length': 15795.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.28117600083351135, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01964358240365982, 'sampling/sampling_logp_difference/max': 3.8497378826141357, 'sampling/importance_sampling_ratio/min': 0.021285315975546837, 'sampling/importance_sampling_ratio/mean': 0.9999802112579346, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8780038207769394, 'clip_ratio/low_mean': 3.3944450819944905e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0912523691786191e-05, 'clip_ratio/high_max': 3.959046694035351e-05, 'clip_ratio/region_mean': 4.485697365907981e-05, 'epoch': 0.33} + + 35%|███▌ | 360/1024 [16:24:35<29:37:36, 160.63s/it]INFO 12-02 05:49:34 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-02 05:49:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:49:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:49:35 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▌ | 361/1024 [16:27:25<30:06:26, 163.48s/it] + {'loss': 0.0109, 'grad_norm': 0.0050973957404494286, 'learning_rate': 1e-05, 'num_tokens': 315060842.0, 'completions/mean_length': 6718.2265625, 'completions/min_length': 505.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6486.24853515625, 'completions/min_terminated_length': 505.0, 'completions/max_terminated_length': 16167.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3521803915500641, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019501537084579468, 'sampling/sampling_logp_difference/max': 6.998699188232422, 'sampling/importance_sampling_ratio/min': 0.0009130688849836588, 'sampling/importance_sampling_ratio/mean': 1.000014066696167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8326799497008324, 'clip_ratio/low_mean': 4.137891801292426e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.187473835936544e-06, 'clip_ratio/high_max': 3.065382111344661e-05, 'clip_ratio/region_mean': 5.056639065514901e-05, 'epoch': 0.33} + + 35%|███▌ | 361/1024 [16:27:25<30:06:26, 163.48s/it]INFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:52:25 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▌ | 362/1024 [16:30:26<31:02:10, 168.78s/it] + {'loss': 0.036, 'grad_norm': 0.0019092690199613571, 'learning_rate': 1e-05, 'num_tokens': 316190325.0, 'completions/mean_length': 8666.8359375, 
'completions/min_length': 565.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7941.291015625, 'completions/min_terminated_length': 565.0, 'completions/max_terminated_length': 16128.0, 'rewards/accuracy_reward/mean': 0.234375, 'rewards/accuracy_reward/std': 0.42527204751968384, 'reward': 0.234375, 'reward_std': 0.2022808939218521, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02051631174981594, 'sampling/sampling_logp_difference/max': 10.249995231628418, 'sampling/importance_sampling_ratio/min': 3.5357668821234256e-05, 'sampling/importance_sampling_ratio/mean': 0.9999814629554749, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9526705741882324, 'clip_ratio/low_mean': 1.8797969062234188e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.156213440684951e-06, 'clip_ratio/high_max': 8.624853762739804e-06, 'clip_ratio/region_mean': 2.0954182048171788e-05, 'epoch': 0.33} + + 35%|███▌ | 362/1024 [16:30:26<31:02:10, 168.78s/it]INFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:55:26 [block_pool.py:292] Successfully reset prefix cache + + 35%|███▌ | 363/1024 [16:33:23<31:27:20, 171.32s/it] + {'loss': 0.0729, 'grad_norm': 0.0019530428107827902, 'learning_rate': 1e-05, 'num_tokens': 317191878.0, 'completions/mean_length': 7661.8203125, 'completions/min_length': 649.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7002.16015625, 'completions/min_terminated_length': 649.0, 'completions/max_terminated_length': 15164.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.21382391452789307, 'frac_reward_zero_std': 0.5625, 
'sampling/sampling_logp_difference/mean': 0.019894573837518692, 'sampling/sampling_logp_difference/max': 9.367389678955078, 'sampling/importance_sampling_ratio/min': 8.546619210392237e-05, 'sampling/importance_sampling_ratio/mean': 0.9999173879623413, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8322782590985298, 'clip_ratio/low_mean': 3.521234066283796e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.434908300434472e-06, 'clip_ratio/high_max': 2.147400391550036e-05, 'clip_ratio/region_mean': 4.164724816746457e-05, 'epoch': 0.33} + + 35%|███▌ | 363/1024 [16:33:23<31:27:20, 171.32s/it]INFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 05:58:23 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▌ | 364/1024 [16:36:26<32:01:19, 174.67s/it] + {'loss': 0.0391, 'grad_norm': 0.0031784537713974714, 'learning_rate': 1e-05, 'num_tokens': 318109004.0, 'completions/mean_length': 7024.859375, 'completions/min_length': 693.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6800.240234375, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 15934.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.31800347566604614, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018519200384616852, 'sampling/sampling_logp_difference/max': 8.124353408813477, 'sampling/importance_sampling_ratio/min': 0.0002962362195830792, 'sampling/importance_sampling_ratio/mean': 0.9999352693557739, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.794853538274765, 'clip_ratio/low_mean': 4.2698405422925134e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.089704697842535e-06, 
'clip_ratio/high_max': 1.9436202364886412e-05, 'clip_ratio/region_mean': 4.878810955233348e-05, 'epoch': 0.33} + + 36%|███▌ | 364/1024 [16:36:26<32:01:19, 174.67s/it]INFO 12-02 06:01:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:01:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:01:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:01:26 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▌ | 365/1024 [16:39:09<31:21:59, 171.35s/it] + {'loss': 0.041, 'grad_norm': 0.005080109462141991, 'learning_rate': 1e-05, 'num_tokens': 319059075.0, 'completions/mean_length': 7282.1796875, 'completions/min_length': 870.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6912.1865234375, 'completions/min_terminated_length': 870.0, 'completions/max_terminated_length': 15624.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.26539456844329834, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019978653639554977, 'sampling/sampling_logp_difference/max': 6.136754989624023, 'sampling/importance_sampling_ratio/min': 0.1194523349404335, 'sampling/importance_sampling_ratio/mean': 1.000062108039856, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.904067650437355, 'clip_ratio/low_mean': 4.342453667049995e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0318639169781818e-06, 'clip_ratio/high_max': 4.127455667912727e-06, 'clip_ratio/region_mean': 4.445640047379129e-05, 'epoch': 0.34} + + 36%|███▌ | 365/1024 [16:39:09<31:21:59, 171.35s/it]INFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:04:09 [block_pool.py:292] Successfully reset prefix cache + + 
36%|███▌ | 366/1024 [16:42:21<32:24:49, 177.34s/it] + {'loss': 0.0699, 'grad_norm': 0.0022667953744530678, 'learning_rate': 1e-05, 'num_tokens': 319990046.0, 'completions/mean_length': 7131.5234375, 'completions/min_length': 373.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6596.255859375, 'completions/min_terminated_length': 373.0, 'completions/max_terminated_length': 15625.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.30221715569496155, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02037571743130684, 'sampling/sampling_logp_difference/max': 3.294381618499756, 'sampling/importance_sampling_ratio/min': 0.0370909757912159, 'sampling/importance_sampling_ratio/mean': 0.9999264478683472, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8849587142467499, 'clip_ratio/low_mean': 2.608940076243016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.608940076243016e-05, 'epoch': 0.34} + + 36%|███▌ | 366/1024 [16:42:21<32:24:49, 177.34s/it]INFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:07:20 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▌ | 367/1024 [16:45:11<31:56:51, 175.06s/it] + {'loss': 0.0681, 'grad_norm': 0.00216497085057199, 'learning_rate': 1e-05, 'num_tokens': 320860135.0, 'completions/mean_length': 6655.4453125, 'completions/min_length': 378.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6578.84228515625, 'completions/min_terminated_length': 378.0, 'completions/max_terminated_length': 16205.0, 'rewards/accuracy_reward/mean': 0.5625, 
'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.3369230031967163, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01771342009305954, 'sampling/sampling_logp_difference/max': 7.563511371612549, 'sampling/importance_sampling_ratio/min': 0.0005190494703128934, 'sampling/importance_sampling_ratio/mean': 0.9999319314956665, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7417122721672058, 'clip_ratio/low_mean': 3.4950805911648786e-05, 'clip_ratio/low_min': 4.876336333836662e-06, 'clip_ratio/high_mean': 3.839158978280466e-06, 'clip_ratio/high_max': 1.5356635913121863e-05, 'clip_ratio/region_mean': 3.8789965287833184e-05, 'epoch': 0.34} + + 36%|███▌ | 367/1024 [16:45:11<31:56:51, 175.06s/it]INFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:10:10 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▌ | 368/1024 [16:47:58<31:29:13, 172.79s/it] + {'loss': 0.0643, 'grad_norm': 0.0028338562697172165, 'learning_rate': 1e-05, 'num_tokens': 321783852.0, 'completions/mean_length': 7077.1640625, 'completions/min_length': 26.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6619.45068359375, 'completions/min_terminated_length': 26.0, 'completions/max_terminated_length': 15849.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2120065838098526, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020299233496189117, 'sampling/sampling_logp_difference/max': 11.757177352905273, 'sampling/importance_sampling_ratio/min': 7.83290306571871e-06, 'sampling/importance_sampling_ratio/mean': 0.9998220205307007, 'sampling/importance_sampling_ratio/max': 2.0, 
'entropy': 0.8749325424432755, 'clip_ratio/low_mean': 5.688933060810086e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.297029474604642e-06, 'clip_ratio/high_max': 1.7605634639039636e-05, 'clip_ratio/region_mean': 6.218636053745286e-05, 'epoch': 0.34} + + 36%|███▌ | 368/1024 [16:47:58<31:29:13, 172.79s/it]INFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:12:58 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▌ | 369/1024 [16:50:37<30:40:50, 168.63s/it] + {'loss': 0.0275, 'grad_norm': 0.0022897711023688316, 'learning_rate': 1e-05, 'num_tokens': 322572882.0, 'completions/mean_length': 6034.296875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5525.294921875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 15329.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.2756394147872925, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01858348958194256, 'sampling/sampling_logp_difference/max': 7.7979736328125, 'sampling/importance_sampling_ratio/min': 0.0004105660773348063, 'sampling/importance_sampling_ratio/mean': 0.9999347925186157, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.80014718323946, 'clip_ratio/low_mean': 5.158197632226802e-05, 'clip_ratio/low_min': 3.735804057214409e-06, 'clip_ratio/high_mean': 1.8254570477438392e-06, 'clip_ratio/high_max': 7.301828190975357e-06, 'clip_ratio/region_mean': 5.340743223314348e-05, 'epoch': 0.34} + + 36%|███▌ | 369/1024 [16:50:37<30:40:50, 168.63s/it]INFO 12-02 06:15:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:15:37 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-02 06:15:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:15:37 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▌ | 370/1024 [16:53:25<30:37:29, 168.58s/it] + {'loss': 0.0356, 'grad_norm': 0.003263789461925626, 'learning_rate': 1e-05, 'num_tokens': 323640904.0, 'completions/mean_length': 8172.109375, 'completions/min_length': 733.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7838.29248046875, 'completions/min_terminated_length': 733.0, 'completions/max_terminated_length': 15948.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.3237774670124054, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0208889190107584, 'sampling/sampling_logp_difference/max': 11.588455200195312, 'sampling/importance_sampling_ratio/min': 9.27252222027164e-06, 'sampling/importance_sampling_ratio/mean': 0.9999354481697083, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8732693120837212, 'clip_ratio/low_mean': 4.186752630630508e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.351393047523743e-06, 'clip_ratio/high_max': 9.364057859784225e-06, 'clip_ratio/region_mean': 4.5218919240141986e-05, 'epoch': 0.34} + + 36%|███▌ | 370/1024 [16:53:25<30:37:29, 168.58s/it]INFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:18:25 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▌ | 371/1024 [16:56:28<31:20:32, 172.79s/it] + {'loss': 0.0937, 'grad_norm': 0.0042716520838439465, 'learning_rate': 1e-05, 'num_tokens': 324643858.0, 'completions/mean_length': 7699.203125, 'completions/min_length': 1225.0, 'completions/max_length': 16384.0, 
'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7419.04833984375, 'completions/min_terminated_length': 1225.0, 'completions/max_terminated_length': 16228.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3090519607067108, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018926654011011124, 'sampling/sampling_logp_difference/max': 8.413164138793945, 'sampling/importance_sampling_ratio/min': 0.00022192654432728887, 'sampling/importance_sampling_ratio/mean': 0.9999874234199524, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8296505436301231, 'clip_ratio/low_mean': 4.261424010110204e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.692962131182867e-06, 'clip_ratio/high_max': 2.0998899799451465e-05, 'clip_ratio/region_mean': 4.930720297124935e-05, 'epoch': 0.34} + + 36%|███▌ | 371/1024 [16:56:28<31:20:32, 172.79s/it]INFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:21:28 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▋ | 372/1024 [16:59:17<31:06:11, 171.74s/it] + {'loss': 0.0424, 'grad_norm': 0.0033558050636202097, 'learning_rate': 1e-05, 'num_tokens': 325617687.0, 'completions/mean_length': 7450.1640625, 'completions/min_length': 910.0, 'completions/max_length': 16364.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7450.1640625, 'completions/min_terminated_length': 910.0, 'completions/max_terminated_length': 16364.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.27222445607185364, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02249298244714737, 
'sampling/sampling_logp_difference/max': 3.2208595275878906, 'sampling/importance_sampling_ratio/min': 0.039920732378959656, 'sampling/importance_sampling_ratio/mean': 0.9999459385871887, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0400195196270943, 'clip_ratio/low_mean': 4.5005243464402156e-05, 'clip_ratio/low_min': 3.861838649754645e-06, 'clip_ratio/high_mean': 1.765337287906732e-06, 'clip_ratio/high_max': 7.061349151626928e-06, 'clip_ratio/region_mean': 4.6770580411248375e-05, 'epoch': 0.34} + + 36%|███▋ | 372/1024 [16:59:17<31:06:11, 171.74s/it]INFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:24:17 [block_pool.py:292] Successfully reset prefix cache + + 36%|███▋ | 373/1024 [17:01:52<30:07:56, 166.63s/it] + {'loss': 0.0476, 'grad_norm': 0.005797459278255701, 'learning_rate': 1e-05, 'num_tokens': 326508384.0, 'completions/mean_length': 6799.0703125, 'completions/min_length': 1708.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6723.5986328125, 'completions/min_terminated_length': 1708.0, 'completions/max_terminated_length': 15342.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021543748676776886, 'sampling/sampling_logp_difference/max': 14.0984525680542, 'sampling/importance_sampling_ratio/min': 7.535634836131067e-07, 'sampling/importance_sampling_ratio/mean': 0.9999321699142456, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9737623482942581, 'clip_ratio/low_mean': 2.4451034505545977e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2869506867427845e-06, 'clip_ratio/high_max': 1.3147802746971138e-05, 
'clip_ratio/region_mean': 2.7737984851228248e-05, 'epoch': 0.34} + + 36%|███▋ | 373/1024 [17:01:52<30:07:56, 166.63s/it]INFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:26:52 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 374/1024 [17:04:39<30:07:25, 166.84s/it] + {'loss': 0.0389, 'grad_norm': 0.002258980879560113, 'learning_rate': 1e-05, 'num_tokens': 327426407.0, 'completions/mean_length': 7034.3671875, 'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6654.30078125, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 16102.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.19674429297447205, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01997346058487892, 'sampling/sampling_logp_difference/max': 4.742221832275391, 'sampling/importance_sampling_ratio/min': 0.008719252422451973, 'sampling/importance_sampling_ratio/mean': 0.9999661445617676, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8749603256583214, 'clip_ratio/low_mean': 2.3457610382138228e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.398505997320171e-07, 'clip_ratio/high_max': 3.3594023989280686e-06, 'clip_ratio/region_mean': 2.4297460981870245e-05, 'epoch': 0.34} + + 37%|███▋ | 374/1024 [17:04:39<30:07:25, 166.84s/it]INFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:29:39 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 375/1024 [17:07:15<29:29:56, 
163.63s/it] + {'loss': 0.076, 'grad_norm': 0.002420129720121622, 'learning_rate': 1e-05, 'num_tokens': 328292985.0, 'completions/mean_length': 6623.078125, 'completions/min_length': 569.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6388.81640625, 'completions/min_terminated_length': 569.0, 'completions/max_terminated_length': 15240.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3077537417411804, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019645996391773224, 'sampling/sampling_logp_difference/max': 8.811544418334961, 'sampling/importance_sampling_ratio/min': 0.00014900295354891568, 'sampling/importance_sampling_ratio/mean': 0.9998596906661987, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.858784057199955, 'clip_ratio/low_mean': 4.9395109726901865e-05, 'clip_ratio/low_min': 1.636556044104509e-05, 'clip_ratio/high_mean': 7.058438370677322e-06, 'clip_ratio/high_max': 2.823375348270929e-05, 'clip_ratio/region_mean': 5.6453548268109444e-05, 'epoch': 0.34} + + 37%|███▋ | 375/1024 [17:07:15<29:29:56, 163.63s/it]INFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:32:15 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 376/1024 [17:09:54<29:11:51, 162.21s/it] + {'loss': 0.0824, 'grad_norm': 0.004107976797968149, 'learning_rate': 1e-05, 'num_tokens': 329067006.0, 'completions/mean_length': 5902.4765625, 'completions/min_length': 574.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5564.36279296875, 'completions/min_terminated_length': 574.0, 'completions/max_terminated_length': 15229.0, 
'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3945493996143341, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019582755863666534, 'sampling/sampling_logp_difference/max': 11.37439250946045, 'sampling/importance_sampling_ratio/min': 1.1485875802463852e-05, 'sampling/importance_sampling_ratio/mean': 0.9999526143074036, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.904740035533905, 'clip_ratio/low_mean': 4.051302585139638e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.468551191574079e-06, 'clip_ratio/high_max': 1.8078507309837732e-05, 'clip_ratio/region_mean': 4.698157727034413e-05, 'epoch': 0.35} + + 37%|███▋ | 376/1024 [17:09:54<29:11:51, 162.21s/it]INFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:34:54 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 377/1024 [17:12:33<28:58:11, 161.19s/it] + {'loss': 0.0164, 'grad_norm': 0.003208522219210863, 'learning_rate': 1e-05, 'num_tokens': 329910691.0, 'completions/mean_length': 6425.6015625, 'completions/min_length': 557.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6267.5322265625, 'completions/min_terminated_length': 557.0, 'completions/max_terminated_length': 14514.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021154657006263733, 'sampling/sampling_logp_difference/max': 6.588794231414795, 'sampling/importance_sampling_ratio/min': 0.00137569778598845, 'sampling/importance_sampling_ratio/mean': 0.9999419450759888, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.964553713798523, 'clip_ratio/low_mean': 1.7552573126522475e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.276365181496658e-06, 'clip_ratio/high_max': 2.553658168835682e-05, 'clip_ratio/region_mean': 2.482893796695862e-05, 'epoch': 0.35} + + 37%|███▋ | 377/1024 [17:12:33<28:58:11, 161.19s/it]INFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:37:33 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 378/1024 [17:15:32<29:51:26, 166.39s/it] + {'loss': 0.0815, 'grad_norm': 0.002898421371355653, 'learning_rate': 1e-05, 'num_tokens': 330956332.0, 'completions/mean_length': 8006.4453125, 'completions/min_length': 1235.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7594.43408203125, 'completions/min_terminated_length': 1235.0, 'completions/max_terminated_length': 15797.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.20175684988498688, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021021340042352676, 'sampling/sampling_logp_difference/max': 9.27452278137207, 'sampling/importance_sampling_ratio/min': 9.378339746035635e-05, 'sampling/importance_sampling_ratio/mean': 0.9998818635940552, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8980336412787437, 'clip_ratio/low_mean': 4.0991827404468495e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7178105053972104e-06, 'clip_ratio/high_max': 6.8712420215888415e-06, 'clip_ratio/region_mean': 4.2709637853022286e-05, 'epoch': 0.35} + + 37%|███▋ | 378/1024 [17:15:32<29:51:26, 166.39s/it]INFO 12-02 06:40:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 
06:40:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:40:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:40:31 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 379/1024 [17:18:11<29:25:21, 164.22s/it] + {'loss': 0.0313, 'grad_norm': 0.0037063576746731997, 'learning_rate': 1e-05, 'num_tokens': 331880918.0, 'completions/mean_length': 7068.828125, 'completions/min_length': 791.0, 'completions/max_length': 15484.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7068.828125, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 15484.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.17859892547130585, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02072504535317421, 'sampling/sampling_logp_difference/max': 8.611893653869629, 'sampling/importance_sampling_ratio/min': 0.0001819290773710236, 'sampling/importance_sampling_ratio/mean': 0.9999452829360962, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9865007549524307, 'clip_ratio/low_mean': 2.2689344689297286e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.2689344689297286e-05, 'epoch': 0.35} + + 37%|███▋ | 379/1024 [17:18:11<29:25:21, 164.22s/it]INFO 12-02 06:43:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:43:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:43:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:43:11 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 380/1024 [17:21:07<30:00:25, 167.74s/it] + {'loss': 0.0228, 'grad_norm': 0.001972826896235347, 'learning_rate': 1e-05, 'num_tokens': 332849112.0, 'completions/mean_length': 7379.390625, 'completions/min_length': 738.0, 'completions/max_length': 16384.0, 
'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7236.4609375, 'completions/min_terminated_length': 738.0, 'completions/max_terminated_length': 16281.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.28247418999671936, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019411223009228706, 'sampling/sampling_logp_difference/max': 10.476028442382812, 'sampling/importance_sampling_ratio/min': 2.820451663865242e-05, 'sampling/importance_sampling_ratio/mean': 0.999925971031189, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8977236375212669, 'clip_ratio/low_mean': 3.207486906831036e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4614083170272352e-06, 'clip_ratio/high_max': 5.845633268108941e-06, 'clip_ratio/region_mean': 3.353627721480734e-05, 'epoch': 0.35} + + 37%|███▋ | 380/1024 [17:21:07<30:00:25, 167.74s/it]INFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:46:06 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 381/1024 [17:23:46<29:30:44, 165.23s/it] + {'loss': 0.0495, 'grad_norm': 0.006926023401319981, 'learning_rate': 1e-05, 'num_tokens': 333746179.0, 'completions/mean_length': 6867.9609375, 'completions/min_length': 760.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6793.03125, 'completions/min_terminated_length': 760.0, 'completions/max_terminated_length': 15517.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.1433562934398651, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.020311862230300903, 
'sampling/sampling_logp_difference/max': 7.8556413650512695, 'sampling/importance_sampling_ratio/min': 0.0003875594411510974, 'sampling/importance_sampling_ratio/mean': 0.9999299645423889, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9244343340396881, 'clip_ratio/low_mean': 2.3530714997832547e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2188462505946518e-06, 'clip_ratio/high_max': 4.875385002378607e-06, 'clip_ratio/region_mean': 2.47495612484272e-05, 'epoch': 0.35} + + 37%|███▋ | 381/1024 [17:23:46<29:30:44, 165.23s/it]INFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:48:46 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 382/1024 [17:26:35<29:38:12, 166.19s/it] + {'loss': 0.0808, 'grad_norm': 0.0047226278111338615, 'learning_rate': 1e-05, 'num_tokens': 334731027.0, 'completions/mean_length': 7525.375, 'completions/min_length': 654.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6855.3955078125, 'completions/min_terminated_length': 654.0, 'completions/max_terminated_length': 15900.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.3353874683380127, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021496692672371864, 'sampling/sampling_logp_difference/max': 8.119979858398438, 'sampling/importance_sampling_ratio/min': 0.00029753465787507594, 'sampling/importance_sampling_ratio/mean': 0.9999615550041199, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9207312315702438, 'clip_ratio/low_mean': 5.268017821435933e-05, 'clip_ratio/low_min': 3.950945028918795e-06, 'clip_ratio/high_mean': 4.836261211949022e-06, 'clip_ratio/high_max': 1.5651628245905158e-05, 
'clip_ratio/region_mean': 5.751643902840442e-05, 'epoch': 0.35} + + 37%|███▋ | 382/1024 [17:26:35<29:38:12, 166.19s/it]INFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:51:34 [block_pool.py:292] Successfully reset prefix cache + + 37%|███▋ | 383/1024 [17:29:48<31:01:15, 174.22s/it] + {'loss': 0.0126, 'grad_norm': 0.004971448332071304, 'learning_rate': 1e-05, 'num_tokens': 335631243.0, 'completions/mean_length': 6841.0625, 'completions/min_length': 689.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6453.13818359375, 'completions/min_terminated_length': 689.0, 'completions/max_terminated_length': 16251.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2596156895160675, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020256079733371735, 'sampling/sampling_logp_difference/max': 11.547955513000488, 'sampling/importance_sampling_ratio/min': 9.655764188210014e-06, 'sampling/importance_sampling_ratio/mean': 0.999934196472168, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8979457840323448, 'clip_ratio/low_mean': 4.519663821156428e-05, 'clip_ratio/low_min': 2.775434040813707e-06, 'clip_ratio/high_mean': 9.53844971718354e-06, 'clip_ratio/high_max': 3.815379886873416e-05, 'clip_ratio/region_mean': 5.473508826980833e-05, 'epoch': 0.35} + + 37%|███▋ | 383/1024 [17:29:48<31:01:15, 174.22s/it]INFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:54:47 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 384/1024 
[17:32:49<31:22:32, 176.49s/it] + {'loss': 0.0262, 'grad_norm': 0.0038604787550866604, 'learning_rate': 1e-05, 'num_tokens': 336537162.0, 'completions/mean_length': 6919.8046875, 'completions/min_length': 896.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6454.35205078125, 'completions/min_terminated_length': 896.0, 'completions/max_terminated_length': 15060.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2777610421180725, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02030865103006363, 'sampling/sampling_logp_difference/max': 6.999982833862305, 'sampling/importance_sampling_ratio/min': 0.0009118975722230971, 'sampling/importance_sampling_ratio/mean': 0.9998080730438232, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9241961911320686, 'clip_ratio/low_mean': 3.1563491688757495e-05, 'clip_ratio/low_min': 3.1228139505401487e-06, 'clip_ratio/high_mean': 1.0405914281363948e-06, 'clip_ratio/high_max': 4.162365712545579e-06, 'clip_ratio/region_mean': 3.260408311689389e-05, 'epoch': 0.35} + + 38%|███▊ | 384/1024 [17:32:49<31:22:32, 176.49s/it]INFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 06:57:51 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 38%|███▊ | 385/1024 [17:36:02<32:11:34, 181.37s/it] + {'loss': 0.0849, 'grad_norm': 0.004624314606189728, 'learning_rate': 1e-05, 'num_tokens': 337542492.0, 'completions/mean_length': 7679.390625, 'completions/min_length': 105.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7099.08349609375, 'completions/min_terminated_length': 105.0, 'completions/max_terminated_length': 15692.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.2517249882221222, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02206476218998432, 'sampling/sampling_logp_difference/max': 9.748971939086914, 'sampling/importance_sampling_ratio/min': 5.83546279813163e-05, 'sampling/importance_sampling_ratio/mean': 0.9999251961708069, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0165777206420898, 'clip_ratio/low_mean': 4.3847362121596234e-05, 'clip_ratio/low_min': 6.294533704931382e-06, 'clip_ratio/high_mean': 1.6295562090817839e-06, 'clip_ratio/high_max': 6.5182248363271356e-06, 'clip_ratio/region_mean': 4.547691833067802e-05, 'epoch': 0.35} + + 38%|███▊ | 385/1024 [17:36:02<32:11:34, 181.37s/it]INFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:01:02 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 386/1024 [17:38:35<30:36:12, 172.68s/it] + {'loss': 0.0789, 'grad_norm': 0.0021966886706650257, 'learning_rate': 1e-05, 'num_tokens': 338324279.0, 'completions/mean_length': 5957.5859375, 'completions/min_length': 1705.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5792.08740234375, 
'completions/min_terminated_length': 1705.0, 'completions/max_terminated_length': 15819.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.22962790727615356, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01804077997803688, 'sampling/sampling_logp_difference/max': 7.125762462615967, 'sampling/importance_sampling_ratio/min': 0.0008041196851991117, 'sampling/importance_sampling_ratio/mean': 0.999998927116394, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7705951780080795, 'clip_ratio/low_mean': 3.392923713363416e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5012490166554926e-06, 'clip_ratio/high_max': 6.00499606662197e-06, 'clip_ratio/region_mean': 3.543048615028965e-05, 'epoch': 0.36} + + 38%|███▊ | 386/1024 [17:38:35<30:36:12, 172.68s/it]INFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:03:34 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 387/1024 [17:41:11<29:41:39, 167.82s/it] + {'loss': 0.134, 'grad_norm': 0.001694107661023736, 'learning_rate': 1e-05, 'num_tokens': 339274662.0, 'completions/mean_length': 7269.8046875, 'completions/min_length': 892.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7198.03955078125, 'completions/min_terminated_length': 892.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.30487072467803955, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021742526441812515, 'sampling/sampling_logp_difference/max': 6.4581451416015625, 'sampling/importance_sampling_ratio/min': 0.0015677008777856827, 
'sampling/importance_sampling_ratio/mean': 0.9999039769172668, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0025205165147781, 'clip_ratio/low_mean': 5.276240381135722e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.927837553874269e-06, 'clip_ratio/high_max': 1.5711350215497077e-05, 'clip_ratio/region_mean': 5.669024130838807e-05, 'epoch': 0.36} + + 38%|███▊ | 387/1024 [17:41:11<29:41:39, 167.82s/it]INFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:06:11 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 388/1024 [17:44:10<30:15:10, 171.24s/it] + {'loss': 0.0691, 'grad_norm': 0.004587972536683083, 'learning_rate': 1e-05, 'num_tokens': 340272689.0, 'completions/mean_length': 7643.8359375, 'completions/min_length': 1061.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7288.54443359375, 'completions/min_terminated_length': 1061.0, 'completions/max_terminated_length': 15755.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.35324612259864807, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01862112432718277, 'sampling/sampling_logp_difference/max': 7.210168361663818, 'sampling/importance_sampling_ratio/min': 0.0007390327518805861, 'sampling/importance_sampling_ratio/mean': 0.9999613761901855, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7936615869402885, 'clip_ratio/low_mean': 5.100632029098051e-05, 'clip_ratio/low_min': 8.934973720897688e-06, 'clip_ratio/high_mean': 1.7514622072667407e-06, 'clip_ratio/high_max': 7.005848829066963e-06, 'clip_ratio/region_mean': 5.275778244140383e-05, 'epoch': 0.36} + + 38%|███▊ | 388/1024 [17:44:10<30:15:10, 171.24s/it]INFO 
12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:09:10 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 389/1024 [17:46:58<30:02:05, 170.28s/it] + {'loss': 0.0636, 'grad_norm': 0.00245783943682909, 'learning_rate': 1e-05, 'num_tokens': 341195599.0, 'completions/mean_length': 7068.734375, 'completions/min_length': 775.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6610.60595703125, 'completions/min_terminated_length': 775.0, 'completions/max_terminated_length': 14401.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.21594557166099548, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019989900290966034, 'sampling/sampling_logp_difference/max': 11.090067863464355, 'sampling/importance_sampling_ratio/min': 1.526316918898374e-05, 'sampling/importance_sampling_ratio/mean': 0.999957263469696, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8858344480395317, 'clip_ratio/low_mean': 2.139122614153166e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6306840936740628e-06, 'clip_ratio/high_max': 1.0522736374696251e-05, 'clip_ratio/region_mean': 2.4021910121518886e-05, 'epoch': 0.36} + + 38%|███▊ | 389/1024 [17:46:58<30:02:05, 170.28s/it]INFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:11:58 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 390/1024 [17:49:42<29:38:40, 168.33s/it] + {'loss': 0.0181, 'grad_norm': 0.0067657483741641045, 'learning_rate': 1e-05, 'num_tokens': 341993565.0, 
'completions/mean_length': 6093.296875, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5929.95263671875, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 15788.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.12415502220392227, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02037961222231388, 'sampling/sampling_logp_difference/max': 4.56026554107666, 'sampling/importance_sampling_ratio/min': 0.010459281504154205, 'sampling/importance_sampling_ratio/mean': 0.9998992681503296, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9640207663178444, 'clip_ratio/low_mean': 2.2939096254503966e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.318064846600464e-06, 'clip_ratio/high_max': 5.272259386401856e-06, 'clip_ratio/region_mean': 2.4257160987417592e-05, 'epoch': 0.36} + + 38%|███▊ | 390/1024 [17:49:42<29:38:40, 168.33s/it]INFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:14:42 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 391/1024 [17:52:24<29:17:06, 166.55s/it] + {'loss': 0.0306, 'grad_norm': 0.0018817185191437602, 'learning_rate': 1e-05, 'num_tokens': 342990545.0, 'completions/mean_length': 7620.09375, 'completions/min_length': 1076.0, 'completions/max_length': 16170.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7620.09375, 'completions/min_terminated_length': 1076.0, 'completions/max_terminated_length': 16170.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.18755048513412476, 'frac_reward_zero_std': 
0.5625, 'sampling/sampling_logp_difference/mean': 0.021528441458940506, 'sampling/sampling_logp_difference/max': 7.281149864196777, 'sampling/importance_sampling_ratio/min': 0.0006883936002850533, 'sampling/importance_sampling_ratio/mean': 0.9999568462371826, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9773544892668724, 'clip_ratio/low_mean': 4.566248594528588e-05, 'clip_ratio/low_min': 4.402028480399167e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 4.566248594528588e-05, 'epoch': 0.36} + + 38%|███▊ | 391/1024 [17:52:24<29:17:06, 166.55s/it]INFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:17:24 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 392/1024 [17:55:13<29:21:04, 167.19s/it] + {'loss': 0.087, 'grad_norm': 0.0052104732021689415, 'learning_rate': 1e-05, 'num_tokens': 343898791.0, 'completions/mean_length': 6963.984375, 'completions/min_length': 646.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6737.904296875, 'completions/min_terminated_length': 646.0, 'completions/max_terminated_length': 15053.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3621976971626282, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.021434593945741653, 'sampling/sampling_logp_difference/max': 4.526732921600342, 'sampling/importance_sampling_ratio/min': 0.010815954767167568, 'sampling/importance_sampling_ratio/mean': 0.9999324679374695, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9683744385838509, 'clip_ratio/low_mean': 7.762144696243922e-05, 'clip_ratio/low_min': 2.4772080450929934e-05, 'clip_ratio/high_mean': 7.985045499481203e-06, 
'clip_ratio/high_max': 2.6727505428425502e-05, 'clip_ratio/region_mean': 8.560649303035461e-05, 'epoch': 0.36} + + 38%|███▊ | 392/1024 [17:55:13<29:21:04, 167.19s/it]INFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:20:13 [block_pool.py:292] Successfully reset prefix cache + + 38%|███▊ | 393/1024 [17:57:50<28:44:55, 164.02s/it] + {'loss': 0.0085, 'grad_norm': 0.005151392426341772, 'learning_rate': 1e-05, 'num_tokens': 344779672.0, 'completions/mean_length': 6718.5078125, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6642.4013671875, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 15116.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.251188188791275, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0201373603194952, 'sampling/sampling_logp_difference/max': 6.025149822235107, 'sampling/importance_sampling_ratio/min': 0.0024171893019229174, 'sampling/importance_sampling_ratio/mean': 0.999840497970581, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9043834507465363, 'clip_ratio/low_mean': 2.5377692509209737e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.365133804640209e-06, 'clip_ratio/high_max': 1.3545108686230378e-05, 'clip_ratio/region_mean': 2.9742826200163108e-05, 'epoch': 0.36} + + 38%|███▊ | 393/1024 [17:57:50<28:44:55, 164.02s/it]INFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:22:49 [block_pool.py:292] Successfully reset prefix cache 
+ + 38%|███▊ | 394/1024 [18:00:30<28:29:30, 162.81s/it] + {'loss': 0.0554, 'grad_norm': 0.0026606651954352856, 'learning_rate': 1e-05, 'num_tokens': 345701722.0, 'completions/mean_length': 7044.640625, 'completions/min_length': 411.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6820.49609375, 'completions/min_terminated_length': 411.0, 'completions/max_terminated_length': 16342.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.24146249890327454, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01981864869594574, 'sampling/sampling_logp_difference/max': 10.157968521118164, 'sampling/importance_sampling_ratio/min': 3.8765938370488584e-05, 'sampling/importance_sampling_ratio/mean': 1.0000128746032715, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9017335474491119, 'clip_ratio/low_mean': 2.739263118201052e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.927679188109323e-06, 'clip_ratio/high_max': 1.2263486723895767e-05, 'clip_ratio/region_mean': 3.132031042696326e-05, 'epoch': 0.36} + + 38%|███▊ | 394/1024 [18:00:30<28:29:30, 162.81s/it]INFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:25:29 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▊ | 395/1024 [18:02:55<27:32:05, 157.59s/it] + {'loss': 0.0947, 'grad_norm': 0.003957017324864864, 'learning_rate': 1e-05, 'num_tokens': 346492810.0, 'completions/mean_length': 6031.875, 'completions/min_length': 520.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5950.3623046875, 'completions/min_terminated_length': 520.0, 'completions/max_terminated_length': 15476.0, 
'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2858940362930298, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018711457028985023, 'sampling/sampling_logp_difference/max': 6.493460178375244, 'sampling/importance_sampling_ratio/min': 0.0015133036067709327, 'sampling/importance_sampling_ratio/mean': 0.9999707341194153, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8537683561444283, 'clip_ratio/low_mean': 4.819571529424138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.566390890024195e-06, 'clip_ratio/high_max': 1.026556356009678e-05, 'clip_ratio/region_mean': 5.0762106297952414e-05, 'epoch': 0.36} + + 39%|███▊ | 395/1024 [18:02:55<27:32:05, 157.59s/it]INFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:27:55 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▊ | 396/1024 [18:06:06<29:14:32, 167.63s/it] + {'loss': 0.1257, 'grad_norm': 0.002122648525983095, 'learning_rate': 1e-05, 'num_tokens': 347462871.0, 'completions/mean_length': 7429.3515625, 'completions/min_length': 1194.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6911.31396484375, 'completions/min_terminated_length': 1194.0, 'completions/max_terminated_length': 15942.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01998838409781456, 'sampling/sampling_logp_difference/max': 8.873497009277344, 'sampling/importance_sampling_ratio/min': 0.00014005196862854064, 'sampling/importance_sampling_ratio/mean': 1.0000076293945312, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8821266070008278, 'clip_ratio/low_mean': 3.637038832948747e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4676222122034233e-06, 'clip_ratio/high_max': 5.870488848813693e-06, 'clip_ratio/region_mean': 3.783801014378696e-05, 'epoch': 0.36} + + 39%|███▊ | 396/1024 [18:06:06<29:14:32, 167.63s/it]INFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:31:06 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▉ | 397/1024 [18:09:09<29:58:12, 172.08s/it] + {'loss': 0.0676, 'grad_norm': 0.002546454081311822, 'learning_rate': 1e-05, 'num_tokens': 348395842.0, 'completions/mean_length': 7131.7109375, 'completions/min_length': 821.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6833.25, 'completions/min_terminated_length': 821.0, 'completions/max_terminated_length': 15761.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2869499623775482, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0193922221660614, 'sampling/sampling_logp_difference/max': 8.436627388000488, 'sampling/importance_sampling_ratio/min': 0.0002167800412280485, 'sampling/importance_sampling_ratio/mean': 0.999964714050293, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8575824722647667, 'clip_ratio/low_mean': 6.443337406381033e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6659830609787605e-06, 'clip_ratio/high_max': 1.0663932243915042e-05, 'clip_ratio/region_mean': 6.709935701110226e-05, 'epoch': 0.37} + + 39%|███▉ | 397/1024 [18:09:09<29:58:12, 172.08s/it]INFO 12-02 07:34:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:34:08 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:34:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:34:08 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▉ | 398/1024 [18:11:52<29:26:57, 169.36s/it] + {'loss': 0.0818, 'grad_norm': 0.00492837093770504, 'learning_rate': 1e-05, 'num_tokens': 349292790.0, 'completions/mean_length': 6858.34375, 'completions/min_length': 772.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6707.14306640625, 'completions/min_terminated_length': 772.0, 'completions/max_terminated_length': 16200.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.1949220597743988, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020318543538451195, 'sampling/sampling_logp_difference/max': 6.79857063293457, 'sampling/importance_sampling_ratio/min': 0.0011153683299198747, 'sampling/importance_sampling_ratio/mean': 0.9998850226402283, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9539813920855522, 'clip_ratio/low_mean': 3.932982110654848e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.618344165573944e-07, 'clip_ratio/high_max': 3.847337666229578e-06, 'clip_ratio/region_mean': 4.029165563679271e-05, 'epoch': 0.37} + + 39%|███▉ | 398/1024 [18:11:52<29:26:57, 169.36s/it]INFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:36:51 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-02 07:38:48,166 - math_verify.grader - WARNING - Timeout during comparison + + 39%|███▉ | 399/1024 [18:14:51<29:56:15, 172.44s/it] + {'loss': 0.0273, 'grad_norm': 0.004895905964076519, 'learning_rate': 1e-05, 'num_tokens': 350312556.0, 
'completions/mean_length': 7809.984375, 'completions/min_length': 1002.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7533.40283203125, 'completions/min_terminated_length': 1002.0, 'completions/max_terminated_length': 15261.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.22567616403102875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018754083663225174, 'sampling/sampling_logp_difference/max': 7.0799760818481445, 'sampling/importance_sampling_ratio/min': 0.0008417933131568134, 'sampling/importance_sampling_ratio/mean': 0.9999260306358337, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8353303670883179, 'clip_ratio/low_mean': 3.8245348378040944e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.22843152389396e-06, 'clip_ratio/high_max': 1.291372609557584e-05, 'clip_ratio/region_mean': 4.1473780811429606e-05, 'epoch': 0.37} + + 39%|███▉ | 399/1024 [18:14:51<29:56:15, 172.44s/it]INFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:39:51 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▉ | 400/1024 [18:17:33<29:19:50, 169.21s/it] + {'loss': 0.0402, 'grad_norm': 0.0032397822942584753, 'learning_rate': 1e-05, 'num_tokens': 351252755.0, 'completions/mean_length': 7194.9296875, 'completions/min_length': 233.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6821.39013671875, 'completions/min_terminated_length': 233.0, 'completions/max_terminated_length': 15057.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.19438527524471283, 
'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02105094864964485, 'sampling/sampling_logp_difference/max': 8.370504379272461, 'sampling/importance_sampling_ratio/min': 0.00023159870761446655, 'sampling/importance_sampling_ratio/mean': 0.9998766183853149, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9744522422552109, 'clip_ratio/low_mean': 3.196108968950284e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5690324011738994e-06, 'clip_ratio/high_max': 1.1250081115576904e-05, 'clip_ratio/region_mean': 3.553012152224255e-05, 'epoch': 0.37} + + 39%|███▉ | 400/1024 [18:17:33<29:19:50, 169.21s/it]INFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:42:33 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▉ | 401/1024 [18:20:13<28:48:34, 166.48s/it] + {'loss': 0.0424, 'grad_norm': 0.0031576494220644236, 'learning_rate': 1e-05, 'num_tokens': 352145873.0, 'completions/mean_length': 6836.234375, 'completions/min_length': 379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6607.08837890625, 'completions/min_terminated_length': 379.0, 'completions/max_terminated_length': 15745.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020445333793759346, 'sampling/sampling_logp_difference/max': 6.727474689483643, 'sampling/importance_sampling_ratio/min': 0.0011975533561781049, 'sampling/importance_sampling_ratio/mean': 0.9999266862869263, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9149863049387932, 'clip_ratio/low_mean': 2.2670621888210007e-05, 'clip_ratio/low_min': 0.0, 
'clip_ratio/high_mean': 1.7451138774049468e-06, 'clip_ratio/high_max': 6.980455509619787e-06, 'clip_ratio/region_mean': 2.441573599298863e-05, 'epoch': 0.37} + + 39%|███▉ | 401/1024 [18:20:13<28:48:34, 166.48s/it]INFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:45:13 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▉ | 402/1024 [18:23:03<28:57:20, 167.59s/it] + {'loss': 0.051, 'grad_norm': 0.003970830701291561, 'learning_rate': 1e-05, 'num_tokens': 353056405.0, 'completions/mean_length': 6942.65625, 'completions/min_length': 175.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6638.0966796875, 'completions/min_terminated_length': 175.0, 'completions/max_terminated_length': 16380.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3282659649848938, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018101349472999573, 'sampling/sampling_logp_difference/max': 11.687329292297363, 'sampling/importance_sampling_ratio/min': 8.399576472584158e-06, 'sampling/importance_sampling_ratio/mean': 1.0000462532043457, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7541583999991417, 'clip_ratio/low_mean': 5.359988131203863e-05, 'clip_ratio/low_min': 1.3856095392839052e-05, 'clip_ratio/high_mean': 5.889334147468617e-06, 'clip_ratio/high_max': 2.3557336589874467e-05, 'clip_ratio/region_mean': 5.9489215118446737e-05, 'epoch': 0.37} + + 39%|███▉ | 402/1024 [18:23:03<28:57:20, 167.59s/it]INFO 12-02 07:48:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:48:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:48:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 
07:48:03 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▉ | 403/1024 [18:25:37<28:10:05, 163.29s/it] + {'loss': 0.029, 'grad_norm': 0.0043656788766384125, 'learning_rate': 1e-05, 'num_tokens': 353844661.0, 'completions/mean_length': 6022.1875, 'completions/min_length': 1285.0, 'completions/max_length': 14786.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6022.1875, 'completions/min_terminated_length': 1285.0, 'completions/max_terminated_length': 14786.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.22225631773471832, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020655371248722076, 'sampling/sampling_logp_difference/max': 2.9993722438812256, 'sampling/importance_sampling_ratio/min': 0.04981832951307297, 'sampling/importance_sampling_ratio/mean': 0.9999772310256958, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9535745903849602, 'clip_ratio/low_mean': 1.968103515537223e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.517377525800839e-06, 'clip_ratio/high_max': 2.6139805413549766e-05, 'clip_ratio/region_mean': 2.7198412681173068e-05, 'epoch': 0.37} + + 39%|███▉ | 403/1024 [18:25:37<28:10:05, 163.29s/it]INFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:50:36 [block_pool.py:292] Successfully reset prefix cache + + 39%|███▉ | 404/1024 [18:28:31<28:42:05, 166.65s/it] + {'loss': 0.006, 'grad_norm': 0.006543307099491358, 'learning_rate': 1e-05, 'num_tokens': 354894689.0, 'completions/mean_length': 8068.96875, 'completions/min_length': 468.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7869.408203125, 'completions/min_terminated_length': 468.0, 
'completions/max_terminated_length': 15906.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.24988999962806702, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021852033212780952, 'sampling/sampling_logp_difference/max': 9.614944458007812, 'sampling/importance_sampling_ratio/min': 6.672408926533535e-05, 'sampling/importance_sampling_ratio/mean': 0.9999514818191528, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9473539590835571, 'clip_ratio/low_mean': 5.21388310517068e-05, 'clip_ratio/low_min': 2.633131089169183e-06, 'clip_ratio/high_mean': 2.9508817647183605e-06, 'clip_ratio/high_max': 9.152076700047473e-06, 'clip_ratio/region_mean': 5.508971298695542e-05, 'epoch': 0.37} + + 39%|███▉ | 404/1024 [18:28:31<28:42:05, 166.65s/it]INFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:53:31 [block_pool.py:292] Successfully reset prefix cache + + 40%|███▉ | 405/1024 [18:31:09<28:11:11, 163.93s/it] + {'loss': 0.0293, 'grad_norm': 0.003351036459207535, 'learning_rate': 1e-05, 'num_tokens': 355677273.0, 'completions/mean_length': 5960.1875, 'completions/min_length': 491.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5878.1103515625, 'completions/min_terminated_length': 491.0, 'completions/max_terminated_length': 15748.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.31642353534698486, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.021779976785182953, 'sampling/sampling_logp_difference/max': 6.656237602233887, 'sampling/importance_sampling_ratio/min': 0.0012859756825491786, 
'sampling/importance_sampling_ratio/mean': 0.9999220371246338, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9564141109585762, 'clip_ratio/low_mean': 5.5152235972855124e-05, 'clip_ratio/low_min': 1.0455875781190116e-05, 'clip_ratio/high_mean': 7.4048172109542065e-06, 'clip_ratio/high_max': 2.9619268843816826e-05, 'clip_ratio/region_mean': 6.255705375224352e-05, 'epoch': 0.37} + + 40%|███▉ | 405/1024 [18:31:09<28:11:11, 163.93s/it]INFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:56:08 [block_pool.py:292] Successfully reset prefix cache + + 40%|███▉ | 406/1024 [18:34:06<28:49:01, 167.87s/it] + {'loss': 0.039, 'grad_norm': 0.0031219006050378084, 'learning_rate': 1e-05, 'num_tokens': 356675829.0, 'completions/mean_length': 7620.21875, 'completions/min_length': 328.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7189.212890625, 'completions/min_terminated_length': 328.0, 'completions/max_terminated_length': 15669.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.1751839816570282, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021951109170913696, 'sampling/sampling_logp_difference/max': 4.591080188751221, 'sampling/importance_sampling_ratio/min': 0.010141897015273571, 'sampling/importance_sampling_ratio/mean': 1.0001060962677002, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.035948596894741, 'clip_ratio/low_mean': 3.758041248147492e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.989421491543908e-06, 'clip_ratio/high_max': 7.957685966175632e-06, 'clip_ratio/region_mean': 3.956983414354909e-05, 'epoch': 0.37} + + 40%|███▉ | 406/1024 [18:34:06<28:49:01, 167.87s/it]INFO 12-02 
07:59:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:59:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:59:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 07:59:05 [block_pool.py:292] Successfully reset prefix cache + + 40%|███▉ | 407/1024 [18:36:39<28:00:30, 163.42s/it] + {'loss': 0.0471, 'grad_norm': 0.002810312667861581, 'learning_rate': 1e-05, 'num_tokens': 357438712.0, 'completions/mean_length': 5806.0234375, 'completions/min_length': 1319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5638.119140625, 'completions/min_terminated_length': 1319.0, 'completions/max_terminated_length': 14038.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.22832970321178436, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01965375244617462, 'sampling/sampling_logp_difference/max': 6.747459888458252, 'sampling/importance_sampling_ratio/min': 0.0011738575994968414, 'sampling/importance_sampling_ratio/mean': 0.9999280571937561, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8977029845118523, 'clip_ratio/low_mean': 3.914574369900947e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.7169204978890775e-06, 'clip_ratio/high_max': 2.286768199155631e-05, 'clip_ratio/region_mean': 4.486266482217616e-05, 'epoch': 0.37} + + 40%|███▉ | 407/1024 [18:36:39<28:00:30, 163.42s/it]INFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:01:38 [block_pool.py:292] Successfully reset prefix cache + + 40%|███▉ | 408/1024 [18:39:04<27:01:59, 157.99s/it] + {'loss': 0.0517, 'grad_norm': 0.004516562446951866, 'learning_rate': 1e-05, 'num_tokens': 358296731.0, 
'completions/mean_length': 6537.4609375, 'completions/min_length': 842.0, 'completions/max_length': 15705.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6537.4609375, 'completions/min_terminated_length': 842.0, 'completions/max_terminated_length': 15705.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.1830746978521347, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021242395043373108, 'sampling/sampling_logp_difference/max': 12.946335792541504, 'sampling/importance_sampling_ratio/min': 2.384942035860149e-06, 'sampling/importance_sampling_ratio/mean': 0.9999170303344727, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9577726796269417, 'clip_ratio/low_mean': 3.186109779562685e-05, 'clip_ratio/low_min': 4.3511558942554984e-06, 'clip_ratio/high_mean': 3.054844910366228e-06, 'clip_ratio/high_max': 1.2219379641464911e-05, 'clip_ratio/region_mean': 3.4915943160740426e-05, 'epoch': 0.38} + + 40%|███▉ | 408/1024 [18:39:04<27:01:59, 157.99s/it]INFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:04:04 [block_pool.py:292] Successfully reset prefix cache + + 40%|███▉ | 409/1024 [18:41:51<27:27:01, 160.69s/it] + {'loss': 0.05, 'grad_norm': 0.003542230697348714, 'learning_rate': 1e-05, 'num_tokens': 359327001.0, 'completions/mean_length': 7896.671875, 'completions/min_length': 1047.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7622.88671875, 'completions/min_terminated_length': 1047.0, 'completions/max_terminated_length': 16360.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.23645778000354767, 
'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020085681229829788, 'sampling/sampling_logp_difference/max': 9.124931335449219, 'sampling/importance_sampling_ratio/min': 0.00010891625424847007, 'sampling/importance_sampling_ratio/mean': 0.9998560547828674, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9163230583071709, 'clip_ratio/low_mean': 3.026239573955536e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6056723047295236e-06, 'clip_ratio/high_max': 1.4422689218918094e-05, 'clip_ratio/region_mean': 3.3868068385345396e-05, 'epoch': 0.38} + + 40%|███▉ | 409/1024 [18:41:51<27:27:01, 160.69s/it]INFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:06:51 [block_pool.py:292] Successfully reset prefix cache + + 40%|████ | 410/1024 [18:44:21<26:52:06, 157.53s/it] + {'loss': 0.0518, 'grad_norm': 0.0035069347359240055, 'learning_rate': 1e-05, 'num_tokens': 360208780.0, 'completions/mean_length': 6728.7109375, 'completions/min_length': 454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6652.68505859375, 'completions/min_terminated_length': 454.0, 'completions/max_terminated_length': 15297.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.29932135343551636, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021022530272603035, 'sampling/sampling_logp_difference/max': 11.124998092651367, 'sampling/importance_sampling_ratio/min': 1.4739226571691688e-05, 'sampling/importance_sampling_ratio/mean': 0.9999571442604065, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9010183215141296, 'clip_ratio/low_mean': 4.2465159026505717e-05, 'clip_ratio/low_min': 0.0, 
'clip_ratio/high_mean': 5.474494003010477e-06, 'clip_ratio/high_max': 1.7827243254942005e-05, 'clip_ratio/region_mean': 4.793965263161226e-05, 'epoch': 0.38} + + 40%|████ | 410/1024 [18:44:21<26:52:06, 157.53s/it]INFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:09:21 [block_pool.py:292] Successfully reset prefix cache + + 40%|████ | 411/1024 [18:47:25<28:10:49, 165.50s/it] + {'loss': 0.0221, 'grad_norm': 0.0033910400234162807, 'learning_rate': 1e-05, 'num_tokens': 361098567.0, 'completions/mean_length': 6800.3984375, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6491.25, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 16167.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2306838035583496, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019660964608192444, 'sampling/sampling_logp_difference/max': 6.536596298217773, 'sampling/importance_sampling_ratio/min': 0.001449413481168449, 'sampling/importance_sampling_ratio/mean': 0.9998576641082764, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8654960840940475, 'clip_ratio/low_mean': 2.8587513156708155e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.594247348497447e-06, 'clip_ratio/high_max': 1.0376989393989788e-05, 'clip_ratio/region_mean': 3.1181759936771414e-05, 'epoch': 0.38} + + 40%|████ | 411/1024 [18:47:25<28:10:49, 165.50s/it]INFO 12-02 08:12:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:12:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:12:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:12:25 
[block_pool.py:292] Successfully reset prefix cache + + 40%|████ | 412/1024 [18:50:04<27:48:41, 163.60s/it] + {'loss': 0.0834, 'grad_norm': 0.0036110079381614923, 'learning_rate': 1e-05, 'num_tokens': 362027520.0, 'completions/mean_length': 7103.4453125, 'completions/min_length': 1711.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6956.13525390625, 'completions/min_terminated_length': 1711.0, 'completions/max_terminated_length': 16373.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.33797892928123474, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01939362846314907, 'sampling/sampling_logp_difference/max': 11.458046913146973, 'sampling/importance_sampling_ratio/min': 1.0564122931100428e-05, 'sampling/importance_sampling_ratio/mean': 0.9999338984489441, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8317076042294502, 'clip_ratio/low_mean': 5.8515578757578623e-05, 'clip_ratio/low_min': 1.0348648629587842e-05, 'clip_ratio/high_mean': 7.792090059410839e-06, 'clip_ratio/high_max': 2.3068858354236e-05, 'clip_ratio/region_mean': 6.630766870330262e-05, 'epoch': 0.38} + + 40%|████ | 412/1024 [18:50:04<27:48:41, 163.60s/it]INFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:15:04 [block_pool.py:292] Successfully reset prefix cache + + 40%|████ | 413/1024 [18:52:56<28:10:21, 165.99s/it] + {'loss': 0.0756, 'grad_norm': 0.002141098491847515, 'learning_rate': 1e-05, 'num_tokens': 362985207.0, 'completions/mean_length': 7344.9296875, 'completions/min_length': 1368.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6900.384765625, 
'completions/min_terminated_length': 1368.0, 'completions/max_terminated_length': 15830.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.28930896520614624, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01929464004933834, 'sampling/sampling_logp_difference/max': 10.874617576599121, 'sampling/importance_sampling_ratio/min': 1.8932745661004446e-05, 'sampling/importance_sampling_ratio/mean': 0.9999322891235352, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8387318029999733, 'clip_ratio/low_mean': 5.127149995587388e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.780986948091595e-07, 'clip_ratio/high_max': 3.112394779236638e-06, 'clip_ratio/region_mean': 5.204959859383962e-05, 'epoch': 0.38} + + 40%|████ | 413/1024 [18:52:56<28:10:21, 165.99s/it]INFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:17:56 [block_pool.py:292] Successfully reset prefix cache + + 40%|████ | 414/1024 [18:55:51<28:36:34, 168.84s/it] + {'loss': 0.0608, 'grad_norm': 0.0015244127716869116, 'learning_rate': 1e-05, 'num_tokens': 363823914.0, 'completions/mean_length': 6377.7734375, 'completions/min_length': 839.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6218.94482421875, 'completions/min_terminated_length': 839.0, 'completions/max_terminated_length': 16137.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.1988610327243805, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020688029006123543, 'sampling/sampling_logp_difference/max': 5.061592102050781, 'sampling/importance_sampling_ratio/min': 0.006335465237498283, 
'sampling/importance_sampling_ratio/mean': 0.9999363422393799, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9732858911156654, 'clip_ratio/low_mean': 1.7854434247510653e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3150696531738504e-06, 'clip_ratio/high_max': 5.2602786126954015e-06, 'clip_ratio/region_mean': 1.9169503786997666e-05, 'epoch': 0.38} + + 40%|████ | 414/1024 [18:55:51<28:36:34, 168.84s/it]INFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:20:51 [block_pool.py:292] Successfully reset prefix cache + + 41%|████ | 415/1024 [18:58:31<28:05:27, 166.05s/it] + {'loss': 0.0311, 'grad_norm': 0.002647512126713991, 'learning_rate': 1e-05, 'num_tokens': 364561127.0, 'completions/mean_length': 5599.7890625, 'completions/min_length': 422.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5340.96826171875, 'completions/min_terminated_length': 422.0, 'completions/max_terminated_length': 14456.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01878243312239647, 'sampling/sampling_logp_difference/max': 12.952398300170898, 'sampling/importance_sampling_ratio/min': 2.370526999584399e-06, 'sampling/importance_sampling_ratio/mean': 0.9999077916145325, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8872368410229683, 'clip_ratio/low_mean': 3.3802934012783226e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.548875148837396e-06, 'clip_ratio/high_max': 2.6195500595349586e-05, 'clip_ratio/region_mean': 4.035180882056011e-05, 'epoch': 0.38} + + 41%|████ | 415/1024 [18:58:31<28:05:27, 166.05s/it]INFO 12-02 08:23:31 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:23:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:23:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:23:31 [block_pool.py:292] Successfully reset prefix cache + + 41%|████ | 416/1024 [19:01:26<28:30:45, 168.83s/it] + {'loss': 0.0541, 'grad_norm': 0.0018051012884825468, 'learning_rate': 1e-05, 'num_tokens': 365590124.0, 'completions/mean_length': 7877.2890625, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7385.1650390625, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 15905.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.28407180309295654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019809434190392494, 'sampling/sampling_logp_difference/max': 7.800533294677734, 'sampling/importance_sampling_ratio/min': 0.0004095165350008756, 'sampling/importance_sampling_ratio/mean': 0.9999774694442749, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8416353687644005, 'clip_ratio/low_mean': 7.215861739950924e-05, 'clip_ratio/low_min': 1.4898997051204788e-05, 'clip_ratio/high_mean': 5.3931973980070325e-06, 'clip_ratio/high_max': 2.157278959202813e-05, 'clip_ratio/region_mean': 7.755181559332414e-05, 'epoch': 0.38} + + 41%|████ | 416/1024 [19:01:26<28:30:45, 168.83s/it]INFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:26:26 [block_pool.py:292] Successfully reset prefix cache + + 41%|████ | 417/1024 [19:04:11<28:15:50, 167.63s/it] + {'loss': 0.0146, 'grad_norm': 0.004550795070827007, 'learning_rate': 1e-05, 'num_tokens': 366486337.0, 
'completions/mean_length': 6836.7890625, 'completions/min_length': 909.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6200.30859375, 'completions/min_terminated_length': 909.0, 'completions/max_terminated_length': 16083.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.22620806097984314, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01992485672235489, 'sampling/sampling_logp_difference/max': 9.124993324279785, 'sampling/importance_sampling_ratio/min': 0.0001089095021598041, 'sampling/importance_sampling_ratio/mean': 0.9999873638153076, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8647575601935387, 'clip_ratio/low_mean': 4.230594890941575e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.352486593641515e-06, 'clip_ratio/high_max': 2.540994637456606e-05, 'clip_ratio/region_mean': 4.8658435844117776e-05, 'epoch': 0.38} + + 41%|████ | 417/1024 [19:04:11<28:15:50, 167.63s/it]INFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:29:11 [block_pool.py:292] Successfully reset prefix cache + + 41%|████ | 418/1024 [19:06:47<27:37:23, 164.10s/it] + {'loss': 0.1054, 'grad_norm': 0.005958946421742439, 'learning_rate': 1e-05, 'num_tokens': 367386163.0, 'completions/mean_length': 6884.953125, 'completions/min_length': 1289.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6417.78662109375, 'completions/min_terminated_length': 1289.0, 'completions/max_terminated_length': 16286.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2698703408241272, 
'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019436441361904144, 'sampling/sampling_logp_difference/max': 11.562139511108398, 'sampling/importance_sampling_ratio/min': 9.519772902422119e-06, 'sampling/importance_sampling_ratio/mean': 1.0000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8691708743572235, 'clip_ratio/low_mean': 3.5717548257707676e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.8981661444049678e-06, 'clip_ratio/high_max': 1.1592664577619871e-05, 'clip_ratio/region_mean': 3.861571451579948e-05, 'epoch': 0.38} + + 41%|████ | 418/1024 [19:06:47<27:37:23, 164.10s/it]INFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:31:47 [block_pool.py:292] Successfully reset prefix cache + + 41%|████ | 419/1024 [19:09:35<27:46:02, 165.23s/it] + {'loss': 0.1918, 'grad_norm': 0.00558120384812355, 'learning_rate': 1e-05, 'num_tokens': 368357500.0, 'completions/mean_length': 7439.1328125, 'completions/min_length': 938.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7150.58837890625, 'completions/min_terminated_length': 938.0, 'completions/max_terminated_length': 15574.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.3795146346092224, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018824251368641853, 'sampling/sampling_logp_difference/max': 9.062491416931152, 'sampling/importance_sampling_ratio/min': 0.0001159337698481977, 'sampling/importance_sampling_ratio/mean': 0.9999570250511169, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.795464999973774, 'clip_ratio/low_mean': 3.938097847822064e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 
8.455849524580117e-06, 'clip_ratio/high_max': 2.7658640192385064e-05, 'clip_ratio/region_mean': 4.7836828116487595e-05, 'epoch': 0.39} + + 41%|████ | 419/1024 [19:09:35<27:46:02, 165.23s/it]INFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:34:35 [block_pool.py:292] Successfully reset prefix cache + + 41%|████ | 420/1024 [19:12:18<27:35:39, 164.47s/it] + {'loss': 0.0859, 'grad_norm': 0.004628168884664774, 'learning_rate': 1e-05, 'num_tokens': 369242920.0, 'completions/mean_length': 6751.53125, 'completions/min_length': 715.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6520.3525390625, 'completions/min_terminated_length': 715.0, 'completions/max_terminated_length': 16236.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.20859163999557495, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019376013427972794, 'sampling/sampling_logp_difference/max': 7.406209468841553, 'sampling/importance_sampling_ratio/min': 0.0006074689445085824, 'sampling/importance_sampling_ratio/mean': 0.9999655485153198, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9450879693031311, 'clip_ratio/low_mean': 3.0958593640662e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1273888819450804e-06, 'clip_ratio/high_max': 8.509555527780321e-06, 'clip_ratio/region_mean': 3.308598269313734e-05, 'epoch': 0.39} + + 41%|████ | 420/1024 [19:12:18<27:35:39, 164.47s/it]INFO 12-02 08:37:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:37:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:37:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:37:17 [block_pool.py:292] Successfully 
reset prefix cache + + 41%|████ | 421/1024 [19:15:33<29:05:45, 173.71s/it] + {'loss': 0.1066, 'grad_norm': 0.00389425759203732, 'learning_rate': 1e-05, 'num_tokens': 370159510.0, 'completions/mean_length': 7023.296875, 'completions/min_length': 1628.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6315.3447265625, 'completions/min_terminated_length': 1628.0, 'completions/max_terminated_length': 16164.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.323777437210083, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.016914553940296173, 'sampling/sampling_logp_difference/max': 8.872963905334473, 'sampling/importance_sampling_ratio/min': 0.00014012664905749261, 'sampling/importance_sampling_ratio/mean': 0.9999127388000488, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7378111630678177, 'clip_ratio/low_mean': 4.86290555272717e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.572105126499082e-06, 'clip_ratio/high_max': 1.8288420505996328e-05, 'clip_ratio/region_mean': 5.320115997164976e-05, 'epoch': 0.39} + + 41%|████ | 421/1024 [19:15:33<29:05:45, 173.71s/it]INFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:40:32 [block_pool.py:292] Successfully reset prefix cache + + 41%|████ | 422/1024 [19:18:14<28:25:39, 170.00s/it] + {'loss': 0.0149, 'grad_norm': 0.004324545152485371, 'learning_rate': 1e-05, 'num_tokens': 371162773.0, 'completions/mean_length': 7702.3046875, 'completions/min_length': 423.0, 'completions/max_length': 16018.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7702.3046875, 'completions/min_terminated_length': 423.0, 'completions/max_terminated_length': 
16018.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.23250606656074524, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020495830103754997, 'sampling/sampling_logp_difference/max': 10.687313079833984, 'sampling/importance_sampling_ratio/min': 2.283278627146501e-05, 'sampling/importance_sampling_ratio/mean': 1.00001060962677, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9053447172045708, 'clip_ratio/low_mean': 2.3538930747690756e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.948400371380558e-06, 'clip_ratio/high_max': 2.1269573153404053e-05, 'clip_ratio/region_mean': 2.9487331687505502e-05, 'epoch': 0.39} + + 41%|████ | 422/1024 [19:18:14<28:25:39, 170.00s/it]INFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:43:14 [block_pool.py:292] Successfully reset prefix cache + + 41%|████▏ | 423/1024 [19:20:52<27:45:32, 166.28s/it] + {'loss': 0.0237, 'grad_norm': 0.003239463549107313, 'learning_rate': 1e-05, 'num_tokens': 372067241.0, 'completions/mean_length': 6904.78125, 'completions/min_length': 432.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6754.31787109375, 'completions/min_terminated_length': 432.0, 'completions/max_terminated_length': 15295.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.32719242572784424, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019042208790779114, 'sampling/sampling_logp_difference/max': 8.999999046325684, 'sampling/importance_sampling_ratio/min': 0.00012340991816017777, 'sampling/importance_sampling_ratio/mean': 0.9999598264694214, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7991176024079323, 'clip_ratio/low_mean': 5.831611520079605e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.5737292048688687e-06, 'clip_ratio/high_max': 1.0294916819475475e-05, 'clip_ratio/region_mean': 6.088984559937671e-05, 'epoch': 0.39} + + 41%|████▏ | 423/1024 [19:20:52<27:45:32, 166.28s/it]INFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:45:51 [block_pool.py:292] Successfully reset prefix cache + + 41%|████▏ | 424/1024 [19:23:34<27:30:52, 165.09s/it] + {'loss': 0.0587, 'grad_norm': 0.0015464330790564418, 'learning_rate': 1e-05, 'num_tokens': 372866072.0, 'completions/mean_length': 6107.7421875, 'completions/min_length': 89.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5602.35205078125, 'completions/min_terminated_length': 89.0, 'completions/max_terminated_length': 15399.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.1820138692855835, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019793221727013588, 'sampling/sampling_logp_difference/max': 8.306756019592285, 'sampling/importance_sampling_ratio/min': 0.00024684349773451686, 'sampling/importance_sampling_ratio/mean': 0.999971330165863, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9495253190398216, 'clip_ratio/low_mean': 1.552133551285806e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.926559305815317e-06, 'clip_ratio/high_max': 2.7261318791715894e-05, 'clip_ratio/region_mean': 2.3447895273420727e-05, 'epoch': 0.39} + + 41%|████▏ | 424/1024 [19:23:34<27:30:52, 165.09s/it]INFO 12-02 08:48:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 
08:48:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:48:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:48:34 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 425/1024 [19:26:12<27:07:00, 162.97s/it] + {'loss': 0.1124, 'grad_norm': 0.0024811832699924707, 'learning_rate': 1e-05, 'num_tokens': 373663463.0, 'completions/mean_length': 6079.8046875, 'completions/min_length': 1082.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5747.4111328125, 'completions/min_terminated_length': 1082.0, 'completions/max_terminated_length': 15939.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.2630355656147003, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.017151469364762306, 'sampling/sampling_logp_difference/max': 8.550286293029785, 'sampling/importance_sampling_ratio/min': 0.00019348970090504736, 'sampling/importance_sampling_ratio/mean': 0.9999743103981018, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8005363270640373, 'clip_ratio/low_mean': 3.261690835643094e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.533324717063806e-06, 'clip_ratio/high_max': 2.457227401464479e-05, 'clip_ratio/region_mean': 4.115023284612107e-05, 'epoch': 0.39} + + 42%|████▏ | 425/1024 [19:26:12<27:07:00, 162.97s/it]INFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:51:12 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 426/1024 [19:28:51<26:52:48, 161.82s/it] + {'loss': 0.0959, 'grad_norm': 0.0031475063879042864, 'learning_rate': 1e-05, 'num_tokens': 374517492.0, 'completions/mean_length': 6453.7890625, 'completions/min_length': 347.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6375.5986328125, 'completions/min_terminated_length': 347.0, 'completions/max_terminated_length': 14925.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.19910329580307007, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019899867475032806, 'sampling/sampling_logp_difference/max': 4.156344890594482, 'sampling/importance_sampling_ratio/min': 0.015664709731936455, 'sampling/importance_sampling_ratio/mean': 0.9999594688415527, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9212624430656433, 'clip_ratio/low_mean': 2.132218082806503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.429997251369059e-07, 'clip_ratio/high_max': 3.3719989005476236e-06, 'clip_ratio/region_mean': 2.2165180553201935e-05, 'epoch': 0.39} + + 42%|████▏ | 426/1024 [19:28:51<26:52:48, 161.82s/it]INFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:53:51 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 427/1024 [19:31:16<26:00:01, 156.79s/it] + {'loss': 0.0276, 'grad_norm': 0.004200868774205446, 'learning_rate': 1e-05, 'num_tokens': 375320339.0, 'completions/mean_length': 6126.9921875, 'completions/min_length': 1106.0, 'completions/max_length': 16159.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6126.9921875, 'completions/min_terminated_length': 1106.0, 'completions/max_terminated_length': 16159.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.1830747127532959, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 
0.01833093911409378, 'sampling/sampling_logp_difference/max': 5.156249046325684, 'sampling/importance_sampling_ratio/min': 0.005763276945799589, 'sampling/importance_sampling_ratio/mean': 0.9999815225601196, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8252849578857422, 'clip_ratio/low_mean': 3.784128080042137e-05, 'clip_ratio/low_min': 3.7751804029539926e-06, 'clip_ratio/high_mean': 5.984868664654641e-06, 'clip_ratio/high_max': 1.907509408738406e-05, 'clip_ratio/region_mean': 4.382614952191943e-05, 'epoch': 0.39} + + 42%|████▏ | 427/1024 [19:31:16<26:00:01, 156.79s/it]INFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:56:16 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 428/1024 [19:34:03<26:27:09, 159.78s/it] + {'loss': 0.0481, 'grad_norm': 0.003204014617949724, 'learning_rate': 1e-05, 'num_tokens': 376201015.0, 'completions/mean_length': 6739.09375, 'completions/min_length': 1228.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6427.9677734375, 'completions/min_terminated_length': 1228.0, 'completions/max_terminated_length': 15411.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.37086254358291626, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018961725756525993, 'sampling/sampling_logp_difference/max': 9.195985794067383, 'sampling/importance_sampling_ratio/min': 0.00010144581028725952, 'sampling/importance_sampling_ratio/mean': 0.9998303651809692, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8008574098348618, 'clip_ratio/low_mean': 6.169724406390742e-05, 'clip_ratio/low_min': 7.494657666029525e-06, 'clip_ratio/high_mean': 5.476571459439583e-06, 
'clip_ratio/high_max': 1.8918785372079583e-05, 'clip_ratio/region_mean': 6.717381506859965e-05, 'epoch': 0.39} + + 42%|████▏ | 428/1024 [19:34:03<26:27:09, 159.78s/it]INFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 08:59:03 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 429/1024 [19:36:58<27:10:19, 164.40s/it] + {'loss': 0.0299, 'grad_norm': 0.0039763906970620155, 'learning_rate': 1e-05, 'num_tokens': 377149650.0, 'completions/mean_length': 7245.8984375, 'completions/min_length': 1306.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6951.12060546875, 'completions/min_terminated_length': 1306.0, 'completions/max_terminated_length': 15634.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020948028191924095, 'sampling/sampling_logp_difference/max': 9.420292854309082, 'sampling/importance_sampling_ratio/min': 8.106228051474318e-05, 'sampling/importance_sampling_ratio/mean': 1.0000600814819336, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0351596996188164, 'clip_ratio/low_mean': 5.3925050679026754e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.389697269540193e-06, 'clip_ratio/high_max': 1.3558789078160771e-05, 'clip_ratio/region_mean': 5.731474743697618e-05, 'epoch': 0.39} + + 42%|████▏ | 429/1024 [19:36:58<27:10:19, 164.40s/it]INFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:01:58 [block_pool.py:292] Successfully reset prefix cache + + 
42%|████▏ | 430/1024 [19:39:49<27:26:35, 166.32s/it] + {'loss': 0.0195, 'grad_norm': 0.0031417158897966146, 'learning_rate': 1e-05, 'num_tokens': 378057802.0, 'completions/mean_length': 6958.625, 'completions/min_length': 1047.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6495.08154296875, 'completions/min_terminated_length': 1047.0, 'completions/max_terminated_length': 15608.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.35771697759628296, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019185224547982216, 'sampling/sampling_logp_difference/max': 9.187026023864746, 'sampling/importance_sampling_ratio/min': 0.00010235882655251771, 'sampling/importance_sampling_ratio/mean': 0.9999384880065918, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8360240310430527, 'clip_ratio/low_mean': 4.6149686397711775e-05, 'clip_ratio/low_min': 3.006686938533676e-06, 'clip_ratio/high_mean': 4.259903903403028e-06, 'clip_ratio/high_max': 1.4580486549675697e-05, 'clip_ratio/region_mean': 5.04095905853319e-05, 'epoch': 0.4} + + 42%|████▏ | 430/1024 [19:39:49<27:26:35, 166.32s/it]INFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:04:49 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 431/1024 [19:42:18<26:32:33, 161.14s/it] + {'loss': 0.096, 'grad_norm': 0.004943124484270811, 'learning_rate': 1e-05, 'num_tokens': 378808021.0, 'completions/mean_length': 5696.3984375, 'completions/min_length': 312.0, 'completions/max_length': 15410.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5696.3984375, 'completions/min_terminated_length': 312.0, 'completions/max_terminated_length': 
15410.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.31246691942214966, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018845941871404648, 'sampling/sampling_logp_difference/max': 6.499474048614502, 'sampling/importance_sampling_ratio/min': 0.0015042300801724195, 'sampling/importance_sampling_ratio/mean': 0.9999057054519653, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7887749597430229, 'clip_ratio/low_mean': 5.096616632727091e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6704084373486694e-06, 'clip_ratio/high_max': 6.681633749394678e-06, 'clip_ratio/region_mean': 5.263657521936693e-05, 'epoch': 0.4} + + 42%|████▏ | 431/1024 [19:42:18<26:32:33, 161.14s/it]INFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:07:18 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 432/1024 [19:44:57<26:21:59, 160.34s/it] + {'loss': 0.0546, 'grad_norm': 0.00595651101320982, 'learning_rate': 1e-05, 'num_tokens': 379659710.0, 'completions/mean_length': 6480.8828125, 'completions/min_length': 1013.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6323.69091796875, 'completions/min_terminated_length': 1013.0, 'completions/max_terminated_length': 14233.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01906527951359749, 'sampling/sampling_logp_difference/max': 6.325125217437744, 'sampling/importance_sampling_ratio/min': 0.0017907419241964817, 'sampling/importance_sampling_ratio/mean': 0.9998855590820312, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8796411231160164, 'clip_ratio/low_mean': 3.513921649300755e-05, 'clip_ratio/low_min': 6.075038982089609e-06, 'clip_ratio/high_mean': 5.417880970526312e-06, 'clip_ratio/high_max': 1.7526824194646906e-05, 'clip_ratio/region_mean': 4.0557096895099676e-05, 'epoch': 0.4} + + 42%|████▏ | 432/1024 [19:44:57<26:21:59, 160.34s/it]INFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:09:56 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 433/1024 [19:47:51<27:00:53, 164.56s/it] + {'loss': 0.0683, 'grad_norm': 0.0024527597706764936, 'learning_rate': 1e-05, 'num_tokens': 380640720.0, 'completions/mean_length': 7501.703125, 'completions/min_length': 680.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6829.93310546875, 'completions/min_terminated_length': 680.0, 'completions/max_terminated_length': 16204.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01873261108994484, 'sampling/sampling_logp_difference/max': 13.93749713897705, 'sampling/importance_sampling_ratio/min': 8.851602615322918e-07, 'sampling/importance_sampling_ratio/mean': 0.9999595880508423, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.786028303205967, 'clip_ratio/low_mean': 2.4512424602107785e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.4512424602107785e-05, 'epoch': 0.4} + + 42%|████▏ | 433/1024 [19:47:51<27:00:53, 164.56s/it]INFO 12-02 09:12:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:12:51 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:12:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:12:51 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 434/1024 [19:50:20<26:12:59, 159.97s/it] + {'loss': 0.0514, 'grad_norm': 0.004280989523977041, 'learning_rate': 1e-05, 'num_tokens': 381377981.0, 'completions/mean_length': 5619.2890625, 'completions/min_length': 602.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5448.4208984375, 'completions/min_terminated_length': 602.0, 'completions/max_terminated_length': 15185.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.2398776412010193, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017923470586538315, 'sampling/sampling_logp_difference/max': 6.883193492889404, 'sampling/importance_sampling_ratio/min': 0.0010248658945783973, 'sampling/importance_sampling_ratio/mean': 0.9999443292617798, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8098893761634827, 'clip_ratio/low_mean': 3.1679782978244475e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.505394312876888e-06, 'clip_ratio/high_max': 1.4606259583160863e-05, 'clip_ratio/region_mean': 3.7185177234277944e-05, 'epoch': 0.4} + + 42%|████▏ | 434/1024 [19:50:20<26:12:59, 159.97s/it]INFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:15:20 [block_pool.py:292] Successfully reset prefix cache + + 42%|████▏ | 435/1024 [19:52:57<26:00:14, 158.94s/it] + {'loss': 0.0877, 'grad_norm': 0.004721642471849918, 'learning_rate': 1e-05, 'num_tokens': 382070478.0, 'completions/mean_length': 5243.8203125, 'completions/min_length': 576.0, 
'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5156.1025390625, 'completions/min_terminated_length': 576.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.6875, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.6875, 'reward_std': 0.26538965106010437, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016579966992139816, 'sampling/sampling_logp_difference/max': 6.7663984298706055, 'sampling/importance_sampling_ratio/min': 0.0011518355458974838, 'sampling/importance_sampling_ratio/mean': 0.9999414086341858, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7485036551952362, 'clip_ratio/low_mean': 2.3637440563106793e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.702175888520287e-06, 'clip_ratio/high_max': 1.4808703554081148e-05, 'clip_ratio/region_mean': 2.7339616224253405e-05, 'epoch': 0.4} + + 42%|████▏ | 435/1024 [19:52:57<26:00:14, 158.94s/it]INFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:17:56 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 436/1024 [19:55:35<25:56:19, 158.81s/it] + {'loss': 0.0342, 'grad_norm': 0.00329192029312253, 'learning_rate': 1e-05, 'num_tokens': 382990245.0, 'completions/mean_length': 7021.1796875, 'completions/min_length': 1371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6872.56396484375, 'completions/min_terminated_length': 1371.0, 'completions/max_terminated_length': 15978.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 
0.019863136112689972, 'sampling/sampling_logp_difference/max': 6.058165073394775, 'sampling/importance_sampling_ratio/min': 0.0023386883549392223, 'sampling/importance_sampling_ratio/mean': 0.9999822378158569, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8693460151553154, 'clip_ratio/low_mean': 3.602651599976525e-05, 'clip_ratio/low_min': 4.348733455117326e-06, 'clip_ratio/high_mean': 1.1174359769938746e-05, 'clip_ratio/high_max': 3.1177480195765384e-05, 'clip_ratio/region_mean': 4.720087713394605e-05, 'epoch': 0.4} + + 43%|████▎ | 436/1024 [19:55:35<25:56:19, 158.81s/it]INFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:20:35 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 437/1024 [19:58:32<26:45:53, 164.15s/it] + {'loss': 0.1009, 'grad_norm': 0.0051889242604374886, 'learning_rate': 1e-05, 'num_tokens': 383896717.0, 'completions/mean_length': 6917.625, 'completions/min_length': 945.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6452.0654296875, 'completions/min_terminated_length': 945.0, 'completions/max_terminated_length': 15344.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.3448137044906616, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019528398290276527, 'sampling/sampling_logp_difference/max': 8.749983787536621, 'sampling/importance_sampling_ratio/min': 0.00015846389578655362, 'sampling/importance_sampling_ratio/mean': 0.9999983310699463, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8466897681355476, 'clip_ratio/low_mean': 4.9078003257818636e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7981737389382033e-06, 'clip_ratio/high_max': 
1.1192694955752813e-05, 'clip_ratio/region_mean': 5.1876177280973934e-05, 'epoch': 0.4} + + 43%|████▎ | 437/1024 [19:58:32<26:45:53, 164.15s/it]INFO 12-02 09:23:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:23:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:23:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:23:32 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 438/1024 [20:01:27<27:14:43, 167.38s/it] + {'loss': 0.0087, 'grad_norm': 0.002855573548004031, 'learning_rate': 1e-05, 'num_tokens': 384872622.0, 'completions/mean_length': 7487.5078125, 'completions/min_length': 877.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7346.2939453125, 'completions/min_terminated_length': 877.0, 'completions/max_terminated_length': 16175.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2477683424949646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0209865253418684, 'sampling/sampling_logp_difference/max': 5.557258605957031, 'sampling/importance_sampling_ratio/min': 0.0038593418430536985, 'sampling/importance_sampling_ratio/mean': 0.9999386668205261, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9584660083055496, 'clip_ratio/low_mean': 3.8556312347282073e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.263948757303297e-06, 'clip_ratio/high_max': 2.3224948108691024e-05, 'clip_ratio/region_mean': 4.682026019509067e-05, 'epoch': 0.4} + + 43%|████▎ | 438/1024 [20:01:27<27:14:43, 167.38s/it]INFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:26:26 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 
439/1024 [20:04:04<26:43:19, 164.44s/it] + {'loss': 0.0523, 'grad_norm': 0.004437311552464962, 'learning_rate': 1e-05, 'num_tokens': 385744023.0, 'completions/mean_length': 6637.5078125, 'completions/min_length': 998.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6323.1044921875, 'completions/min_terminated_length': 998.0, 'completions/max_terminated_length': 16092.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2603819966316223, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019490888342261314, 'sampling/sampling_logp_difference/max': 5.834418296813965, 'sampling/importance_sampling_ratio/min': 0.002925124252215028, 'sampling/importance_sampling_ratio/mean': 0.9999136924743652, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8841215297579765, 'clip_ratio/low_mean': 2.98128834401723e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5429051245519076e-06, 'clip_ratio/high_max': 6.171620498207631e-06, 'clip_ratio/region_mean': 3.135578845103737e-05, 'epoch': 0.4} + + 43%|████▎ | 439/1024 [20:04:04<26:43:19, 164.44s/it]INFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:29:04 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 440/1024 [20:06:32<25:51:24, 159.39s/it] + {'loss': -0.0075, 'grad_norm': 0.002463799435645342, 'learning_rate': 1e-05, 'num_tokens': 386525492.0, 'completions/mean_length': 5965.9765625, 'completions/min_length': 621.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5800.611328125, 'completions/min_terminated_length': 621.0, 'completions/max_terminated_length': 15143.0, 
'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.30457615852355957, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01946769654750824, 'sampling/sampling_logp_difference/max': 8.4989652633667, 'sampling/importance_sampling_ratio/min': 0.00020367901015561074, 'sampling/importance_sampling_ratio/mean': 0.9999351501464844, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8726934269070625, 'clip_ratio/low_mean': 5.443932013804442e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3262185752391815e-06, 'clip_ratio/high_max': 1.3304874300956726e-05, 'clip_ratio/region_mean': 5.776553894065728e-05, 'epoch': 0.4} + + 43%|████▎ | 440/1024 [20:06:32<25:51:24, 159.39s/it]INFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:31:32 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 441/1024 [20:09:06<25:32:57, 157.77s/it] + {'loss': 0.0415, 'grad_norm': 0.0038990566972643137, 'learning_rate': 1e-05, 'num_tokens': 387404842.0, 'completions/mean_length': 6693.109375, 'completions/min_length': 1704.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6616.80322265625, 'completions/min_terminated_length': 1704.0, 'completions/max_terminated_length': 16115.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.31587693095207214, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020848294720053673, 'sampling/sampling_logp_difference/max': 6.749990940093994, 'sampling/importance_sampling_ratio/min': 0.0011708902893587947, 'sampling/importance_sampling_ratio/mean': 0.9999700784683228, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9430640190839767, 'clip_ratio/low_mean': 3.598771945689805e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.6154040117253317e-06, 'clip_ratio/high_max': 1.0084711902891286e-05, 'clip_ratio/region_mean': 3.9603123695997056e-05, 'epoch': 0.41} + + 43%|████▎ | 441/1024 [20:09:06<25:32:57, 157.77s/it]INFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:34:06 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 442/1024 [20:12:01<26:20:08, 162.90s/it] + {'loss': 0.099, 'grad_norm': 0.0018510994268581271, 'learning_rate': 1e-05, 'num_tokens': 388324475.0, 'completions/mean_length': 7045.6953125, 'completions/min_length': 926.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6505.46240234375, 'completions/min_terminated_length': 926.0, 'completions/max_terminated_length': 16162.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.32195523381233215, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020547039806842804, 'sampling/sampling_logp_difference/max': 5.752217769622803, 'sampling/importance_sampling_ratio/min': 0.0031757301185280085, 'sampling/importance_sampling_ratio/mean': 0.9999024868011475, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8912066072225571, 'clip_ratio/low_mean': 5.234285907818048e-05, 'clip_ratio/low_min': 4.47803950009984e-06, 'clip_ratio/high_mean': 1.8656716065379442e-06, 'clip_ratio/high_max': 7.462686426151777e-06, 'clip_ratio/region_mean': 5.420853057103159e-05, 'epoch': 0.41} + + 43%|████▎ | 442/1024 [20:12:01<26:20:08, 162.90s/it]INFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:37:00 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 443/1024 [20:14:40<26:07:54, 161.92s/it] + {'loss': 0.061, 'grad_norm': 0.004439481534063816, 'learning_rate': 1e-05, 'num_tokens': 389305644.0, 'completions/mean_length': 7480.0078125, 'completions/min_length': 1130.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7266.3125, 'completions/min_terminated_length': 1130.0, 'completions/max_terminated_length': 15734.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.31300368905067444, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01973455585539341, 'sampling/sampling_logp_difference/max': 4.899544715881348, 'sampling/importance_sampling_ratio/min': 0.007449973840266466, 'sampling/importance_sampling_ratio/mean': 0.9999762773513794, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8813760280609131, 'clip_ratio/low_mean': 6.165269871871715e-05, 'clip_ratio/low_min': 3.5272871627967106e-06, 'clip_ratio/high_mean': 6.26131770786742e-06, 'clip_ratio/high_max': 2.504527083146968e-05, 'clip_ratio/region_mean': 6.791401551708987e-05, 'epoch': 0.41} + + 43%|████▎ | 443/1024 [20:14:40<26:07:54, 161.92s/it]INFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:39:40 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 444/1024 [20:17:11<25:32:19, 158.52s/it] + {'loss': -0.0068, 'grad_norm': 0.004181519150733948, 'learning_rate': 1e-05, 'num_tokens': 390229373.0, 'completions/mean_length': 7044.4453125, 'completions/min_length': 
1229.0, 'completions/max_length': 15302.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7044.4453125, 'completions/min_terminated_length': 1229.0, 'completions/max_terminated_length': 15302.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.17700131237506866, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.021211043000221252, 'sampling/sampling_logp_difference/max': 8.397781372070312, 'sampling/importance_sampling_ratio/min': 0.00022536676260642707, 'sampling/importance_sampling_ratio/mean': 1.0000314712524414, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9901906549930573, 'clip_ratio/low_mean': 3.662567087303614e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0245229304928216e-06, 'clip_ratio/high_max': 4.0980917219712865e-06, 'clip_ratio/region_mean': 3.76501939172158e-05, 'epoch': 0.41} + + 43%|████▎ | 444/1024 [20:17:11<25:32:19, 158.52s/it]INFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:42:11 [block_pool.py:292] Successfully reset prefix cache + + 43%|████▎ | 445/1024 [20:20:09<26:26:12, 164.37s/it] + {'loss': 0.035, 'grad_norm': 0.002691390924155712, 'learning_rate': 1e-05, 'num_tokens': 391251141.0, 'completions/mean_length': 7815.8125, 'completions/min_length': 1350.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7244.6005859375, 'completions/min_terminated_length': 1350.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.31222954392433167, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 
0.018415704369544983, 'sampling/sampling_logp_difference/max': 4.864527702331543, 'sampling/importance_sampling_ratio/min': 0.007715471088886261, 'sampling/importance_sampling_ratio/mean': 0.99993896484375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8278292864561081, 'clip_ratio/low_mean': 5.29995777469594e-05, 'clip_ratio/low_min': 3.708758640641463e-06, 'clip_ratio/high_mean': 3.7274680266818905e-06, 'clip_ratio/high_max': 1.4909872106727562e-05, 'clip_ratio/region_mean': 5.672704537573736e-05, 'epoch': 0.41} + + 43%|████▎ | 445/1024 [20:20:09<26:26:12, 164.37s/it]INFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:45:09 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▎ | 446/1024 [20:22:25<25:01:21, 155.85s/it] + {'loss': 0.1153, 'grad_norm': 0.0069543467834591866, 'learning_rate': 1e-05, 'num_tokens': 391956196.0, 'completions/mean_length': 5305.1796875, 'completions/min_length': 1017.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5217.94482421875, 'completions/min_terminated_length': 1017.0, 'completions/max_terminated_length': 15202.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017318082973361015, 'sampling/sampling_logp_difference/max': 5.996687889099121, 'sampling/importance_sampling_ratio/min': 0.0024869756307452917, 'sampling/importance_sampling_ratio/mean': 1.0000190734863281, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8100772425532341, 'clip_ratio/low_mean': 3.196247394043894e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.629899417021079e-06, 'clip_ratio/high_max': 
2.1858722902834415e-05, 'clip_ratio/region_mean': 3.859237290271267e-05, 'epoch': 0.41} + + 44%|████▎ | 446/1024 [20:22:25<25:01:21, 155.85s/it]INFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:47:25 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▎ | 447/1024 [20:25:12<25:32:09, 159.32s/it] + {'loss': 0.0883, 'grad_norm': 0.0065611582249403, 'learning_rate': 1e-05, 'num_tokens': 392908430.0, 'completions/mean_length': 7299.578125, 'completions/min_length': 1008.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6930.29248046875, 'completions/min_terminated_length': 1008.0, 'completions/max_terminated_length': 15300.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.304571270942688, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02127375639975071, 'sampling/sampling_logp_difference/max': 11.873339653015137, 'sampling/importance_sampling_ratio/min': 6.9738744059577584e-06, 'sampling/importance_sampling_ratio/mean': 0.9999696016311646, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9955824315547943, 'clip_ratio/low_mean': 5.289376917971822e-05, 'clip_ratio/low_min': 4.21926688431995e-06, 'clip_ratio/high_mean': 8.056288947955181e-06, 'clip_ratio/high_max': 2.461934036546154e-05, 'clip_ratio/region_mean': 6.0950058468733914e-05, 'epoch': 0.41} + + 44%|████▎ | 447/1024 [20:25:12<25:32:09, 159.32s/it]INFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:50:12 [block_pool.py:292] Successfully reset prefix cache + + 
44%|████▍ | 448/1024 [20:28:04<26:04:47, 163.00s/it] + {'loss': 0.0725, 'grad_norm': 0.0032975098583847284, 'learning_rate': 1e-05, 'num_tokens': 393788286.0, 'completions/mean_length': 6702.9375, 'completions/min_length': 469.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6390.64501953125, 'completions/min_terminated_length': 469.0, 'completions/max_terminated_length': 16221.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.27168765664100647, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019461583346128464, 'sampling/sampling_logp_difference/max': 8.160128593444824, 'sampling/importance_sampling_ratio/min': 0.00028582560480572283, 'sampling/importance_sampling_ratio/mean': 0.9999115467071533, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.82919991761446, 'clip_ratio/low_mean': 3.89272447591793e-05, 'clip_ratio/low_min': 4.047796210215893e-06, 'clip_ratio/high_mean': 7.412756531266496e-06, 'clip_ratio/high_max': 2.4339562514796853e-05, 'clip_ratio/region_mean': 4.6340001517819474e-05, 'epoch': 0.41} + + 44%|████▍ | 448/1024 [20:28:04<26:04:47, 163.00s/it]INFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:53:06 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 44%|████▍ | 449/1024 [20:31:27<27:55:34, 174.84s/it] + {'loss': 0.1149, 'grad_norm': 0.0032787907402962446, 'learning_rate': 1e-05, 'num_tokens': 394638159.0, 'completions/mean_length': 6468.9453125, 'completions/min_length': 808.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 5536.7607421875, 'completions/min_terminated_length': 808.0, 'completions/max_terminated_length': 15244.0, 'rewards/accuracy_reward/mean': 0.625, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.625, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016151495277881622, 'sampling/sampling_logp_difference/max': 8.999967575073242, 'sampling/importance_sampling_ratio/min': 0.00012341380352154374, 'sampling/importance_sampling_ratio/mean': 0.9999669790267944, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6471721827983856, 'clip_ratio/low_mean': 3.195798365140945e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.951899765932467e-06, 'clip_ratio/high_max': 2.3807599063729867e-05, 'clip_ratio/region_mean': 3.790988330365508e-05, 'epoch': 0.41} + + 44%|████▍ | 449/1024 [20:31:27<27:55:34, 174.84s/it]INFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:56:26 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▍ | 450/1024 [20:34:11<27:22:28, 171.69s/it] + {'loss': 0.0967, 'grad_norm': 0.0038375966250896454, 'learning_rate': 1e-05, 'num_tokens': 395493872.0, 'completions/mean_length': 6547.3203125, 'completions/min_length': 587.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6230.0078125, 'completions/min_terminated_length': 587.0, 
'completions/max_terminated_length': 15931.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.30798619985580444, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019957344979047775, 'sampling/sampling_logp_difference/max': 8.739748001098633, 'sampling/importance_sampling_ratio/min': 0.00016009423416107893, 'sampling/importance_sampling_ratio/mean': 0.9999747276306152, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9123960956931114, 'clip_ratio/low_mean': 6.035319393049576e-05, 'clip_ratio/low_min': 4.063190772285452e-06, 'clip_ratio/high_mean': 5.61768172246957e-06, 'clip_ratio/high_max': 2.247072688987828e-05, 'clip_ratio/region_mean': 6.597087667614687e-05, 'epoch': 0.41} + + 44%|████▍ | 450/1024 [20:34:11<27:22:28, 171.69s/it]INFO 12-02 09:59:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:59:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:59:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 09:59:11 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▍ | 451/1024 [20:36:53<26:51:44, 168.77s/it] + {'loss': 0.0656, 'grad_norm': 0.003903903067111969, 'learning_rate': 1e-05, 'num_tokens': 396320254.0, 'completions/mean_length': 6291.859375, 'completions/min_length': 823.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6131.6669921875, 'completions/min_terminated_length': 823.0, 'completions/max_terminated_length': 15058.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2569621503353119, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020753150805830956, 'sampling/sampling_logp_difference/max': 11.93381404876709, 'sampling/importance_sampling_ratio/min': 6.564632712979801e-06, 
'sampling/importance_sampling_ratio/mean': 0.9999452829360962, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9841655194759369, 'clip_ratio/low_mean': 2.315102483407827e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.5112059322273126e-06, 'clip_ratio/high_max': 1.404482372890925e-05, 'clip_ratio/region_mean': 2.6662230766305584e-05, 'epoch': 0.41} + + 44%|████▍ | 451/1024 [20:36:53<26:51:44, 168.77s/it]INFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:01:52 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▍ | 452/1024 [20:39:55<27:27:47, 172.84s/it] + {'loss': 0.0511, 'grad_norm': 0.005152889993041754, 'learning_rate': 1e-05, 'num_tokens': 397327029.0, 'completions/mean_length': 7692.4296875, 'completions/min_length': 1269.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7339.11376953125, 'completions/min_terminated_length': 1269.0, 'completions/max_terminated_length': 15966.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02036213129758835, 'sampling/sampling_logp_difference/max': 9.897988319396973, 'sampling/importance_sampling_ratio/min': 5.027571751270443e-05, 'sampling/importance_sampling_ratio/mean': 0.9999433755874634, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.94080401211977, 'clip_ratio/low_mean': 3.547988831087423e-05, 'clip_ratio/low_min': 3.3967392027989263e-06, 'clip_ratio/high_mean': 4.615214265868417e-06, 'clip_ratio/high_max': 1.5189204987109406e-05, 'clip_ratio/region_mean': 4.009510257674265e-05, 'epoch': 0.42} + + 44%|████▍ | 452/1024 [20:39:55<27:27:47, 172.84s/it]INFO 12-02 
10:04:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:04:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:04:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:04:55 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▍ | 453/1024 [20:42:43<27:09:15, 171.20s/it] + {'loss': 0.0182, 'grad_norm': 0.0035838852636516094, 'learning_rate': 1e-05, 'num_tokens': 398237536.0, 'completions/mean_length': 6968.0859375, 'completions/min_length': 893.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6742.1044921875, 'completions/min_terminated_length': 893.0, 'completions/max_terminated_length': 15305.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020200733095407486, 'sampling/sampling_logp_difference/max': 6.030359745025635, 'sampling/importance_sampling_ratio/min': 0.002404628787189722, 'sampling/importance_sampling_ratio/mean': 1.0000159740447998, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9254838973283768, 'clip_ratio/low_mean': 2.335082047011383e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.0586507970583625e-06, 'clip_ratio/high_max': 1.733157705530175e-05, 'clip_ratio/region_mean': 2.9409470812424843e-05, 'epoch': 0.42} + + 44%|████▍ | 453/1024 [20:42:43<27:09:15, 171.20s/it]INFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:07:42 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▍ | 454/1024 [20:45:54<28:02:50, 177.14s/it] + {'loss': 0.0412, 'grad_norm': 0.0036290446296334267, 'learning_rate': 1e-05, 'num_tokens': 399373298.0, 
'completions/mean_length': 8711.078125, 'completions/min_length': 1049.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 8199.55078125, 'completions/min_terminated_length': 1049.0, 'completions/max_terminated_length': 16309.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.19568344950675964, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0201371181756258, 'sampling/sampling_logp_difference/max': 9.291923522949219, 'sampling/importance_sampling_ratio/min': 9.216561011271551e-05, 'sampling/importance_sampling_ratio/mean': 1.000042200088501, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8735406622290611, 'clip_ratio/low_mean': 3.311113533754906e-05, 'clip_ratio/low_min': 6.725854291289579e-06, 'clip_ratio/high_mean': 1.116230919251393e-06, 'clip_ratio/high_max': 4.464923677005572e-06, 'clip_ratio/region_mean': 3.422736637048729e-05, 'epoch': 0.42} + + 44%|████▍ | 454/1024 [20:45:54<28:02:50, 177.14s/it]INFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:10:53 [block_pool.py:292] Successfully reset prefix cache + + 44%|████▍ | 455/1024 [20:48:28<26:56:16, 170.43s/it] + {'loss': 0.0633, 'grad_norm': 0.004067540634423494, 'learning_rate': 1e-05, 'num_tokens': 400273708.0, 'completions/mean_length': 6891.078125, 'completions/min_length': 827.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6663.24853515625, 'completions/min_terminated_length': 827.0, 'completions/max_terminated_length': 14737.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.27274850010871887, 
'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019800148904323578, 'sampling/sampling_logp_difference/max': 14.731733322143555, 'sampling/importance_sampling_ratio/min': 4.0002717582865444e-07, 'sampling/importance_sampling_ratio/mean': 0.9999425411224365, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8689641878008842, 'clip_ratio/low_mean': 3.3217100849469716e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.675666151702899e-06, 'clip_ratio/high_max': 3.4702664606811595e-05, 'clip_ratio/region_mean': 4.189276808119757e-05, 'epoch': 0.42} + + 44%|████▍ | 455/1024 [20:48:28<26:56:16, 170.43s/it]INFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:13:28 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▍ | 456/1024 [20:51:10<26:28:46, 167.83s/it] + {'loss': 0.0743, 'grad_norm': 0.0026191689539700747, 'learning_rate': 1e-05, 'num_tokens': 401177497.0, 'completions/mean_length': 6899.3515625, 'completions/min_length': 1149.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6748.8017578125, 'completions/min_terminated_length': 1149.0, 'completions/max_terminated_length': 15234.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.20251333713531494, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021380646154284477, 'sampling/sampling_logp_difference/max': 6.3249406814575195, 'sampling/importance_sampling_ratio/min': 0.0017910725437104702, 'sampling/importance_sampling_ratio/mean': 0.9999812841415405, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9442604705691338, 'clip_ratio/low_mean': 3.564125790944672e-05, 'clip_ratio/low_min': 0.0, 
'clip_ratio/high_mean': 7.347927066803095e-07, 'clip_ratio/high_max': 2.939170826721238e-06, 'clip_ratio/region_mean': 3.6376050502440194e-05, 'epoch': 0.42} + + 45%|████▍ | 456/1024 [20:51:10<26:28:46, 167.83s/it]INFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:16:10 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-02 10:18:20,061 - math_verify.grader - WARNING - Timeout during comparison + + 45%|████▍ | 457/1024 [20:54:26<27:45:22, 176.23s/it] + {'loss': 0.0674, 'grad_norm': 0.003141516586765647, 'learning_rate': 1e-05, 'num_tokens': 402115812.0, 'completions/mean_length': 7175.8359375, 'completions/min_length': 919.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7029.6748046875, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 16226.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.21040895581245422, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01970163732767105, 'sampling/sampling_logp_difference/max': 6.672667980194092, 'sampling/importance_sampling_ratio/min': 0.001265019178390503, 'sampling/importance_sampling_ratio/mean': 0.9999862909317017, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8653769046068192, 'clip_ratio/low_mean': 2.57235833487357e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.24901032197522e-06, 'clip_ratio/high_max': 8.99604128790088e-06, 'clip_ratio/region_mean': 2.797259367071092e-05, 'epoch': 0.42} + + 45%|████▍ | 457/1024 [20:54:26<27:45:22, 176.23s/it]INFO 12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache +INFO 
12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:19:26 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▍ | 458/1024 [20:57:20<27:37:15, 175.68s/it] + {'loss': 0.0751, 'grad_norm': 0.001980370609089732, 'learning_rate': 1e-05, 'num_tokens': 403048385.0, 'completions/mean_length': 7090.8515625, 'completions/min_length': 606.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6791.072265625, 'completions/min_terminated_length': 606.0, 'completions/max_terminated_length': 16250.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.021090596914291382, 'sampling/sampling_logp_difference/max': 13.47822093963623, 'sampling/importance_sampling_ratio/min': 1.4011449138706666e-06, 'sampling/importance_sampling_ratio/mean': 0.9999619722366333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9437825232744217, 'clip_ratio/low_mean': 3.116219727417047e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.700014874790213e-06, 'clip_ratio/high_max': 1.0800059499160852e-05, 'clip_ratio/region_mean': 3.3862211807900167e-05, 'epoch': 0.42} + + 45%|████▍ | 458/1024 [20:57:20<27:37:15, 175.68s/it]INFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:22:20 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▍ | 459/1024 [20:59:57<26:42:06, 170.14s/it] + {'loss': 0.059, 'grad_norm': 0.003833206370472908, 'learning_rate': 1e-05, 'num_tokens': 403968037.0, 'completions/mean_length': 7033.65625, 'completions/min_length': 1007.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 
'completions/mean_terminated_length': 6809.24853515625, 'completions/min_terminated_length': 1007.0, 'completions/max_terminated_length': 16175.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.28460076451301575, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019913772121071815, 'sampling/sampling_logp_difference/max': 6.1218976974487305, 'sampling/importance_sampling_ratio/min': 0.0021942879538983107, 'sampling/importance_sampling_ratio/mean': 1.0000317096710205, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8789731040596962, 'clip_ratio/low_mean': 4.8558076969129615e-05, 'clip_ratio/low_min': 4.8952420002024155e-06, 'clip_ratio/high_mean': 6.370712640091369e-06, 'clip_ratio/high_max': 2.5482850560365478e-05, 'clip_ratio/region_mean': 5.4928788131292094e-05, 'epoch': 0.42} + + 45%|████▍ | 459/1024 [20:59:58<26:42:06, 170.14s/it]INFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:24:57 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▍ | 460/1024 [21:02:57<27:06:18, 173.01s/it] + {'loss': 0.1581, 'grad_norm': 0.005315023008733988, 'learning_rate': 1e-05, 'num_tokens': 404881584.0, 'completions/mean_length': 6992.8984375, 'completions/min_length': 754.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6611.14599609375, 'completions/min_terminated_length': 754.0, 'completions/max_terminated_length': 16107.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3469353914260864, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01872519962489605, 'sampling/sampling_logp_difference/max': 
9.998538970947266, 'sampling/importance_sampling_ratio/min': 4.546630952972919e-05, 'sampling/importance_sampling_ratio/mean': 1.0000758171081543, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.857115626335144, 'clip_ratio/low_mean': 6.774969961043098e-05, 'clip_ratio/low_min': 3.189914878021227e-06, 'clip_ratio/high_mean': 1.0172194606639096e-06, 'clip_ratio/high_max': 4.068877842655638e-06, 'clip_ratio/region_mean': 6.876691895740805e-05, 'epoch': 0.42} + + 45%|████▍ | 460/1024 [21:02:57<27:06:18, 173.01s/it]INFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:27:57 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▌ | 461/1024 [21:05:39<26:31:46, 169.64s/it] + {'loss': 0.1076, 'grad_norm': 0.0074885934591293335, 'learning_rate': 1e-05, 'num_tokens': 405749105.0, 'completions/mean_length': 6623.2578125, 'completions/min_length': 221.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6226.4794921875, 'completions/min_terminated_length': 221.0, 'completions/max_terminated_length': 16095.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.25354722142219543, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01930626854300499, 'sampling/sampling_logp_difference/max': 6.748711109161377, 'sampling/importance_sampling_ratio/min': 0.0011723897187039256, 'sampling/importance_sampling_ratio/mean': 0.9999799728393555, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8803941905498505, 'clip_ratio/low_mean': 3.3195502112448594e-05, 'clip_ratio/low_min': 5.25188033861923e-06, 'clip_ratio/high_mean': 2.9176186444601626e-06, 'clip_ratio/high_max': 1.167047457784065e-05, 'clip_ratio/region_mean': 
3.611312064322192e-05, 'epoch': 0.42} + + 45%|████▌ | 461/1024 [21:05:39<26:31:46, 169.64s/it]INFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:30:39 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▌ | 462/1024 [21:08:19<26:02:32, 166.82s/it] + {'loss': 0.0536, 'grad_norm': 0.003960717935115099, 'learning_rate': 1e-05, 'num_tokens': 406704618.0, 'completions/mean_length': 7244.8203125, 'completions/min_length': 1227.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6647.5419921875, 'completions/min_terminated_length': 1227.0, 'completions/max_terminated_length': 15032.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2880108058452606, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02019711770117283, 'sampling/sampling_logp_difference/max': 10.98397159576416, 'sampling/importance_sampling_ratio/min': 1.69715603988152e-05, 'sampling/importance_sampling_ratio/mean': 0.9999812841415405, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9202689751982689, 'clip_ratio/low_mean': 5.09268712676203e-05, 'clip_ratio/low_min': 1.1170248626513057e-05, 'clip_ratio/high_mean': 1.0293827017449075e-06, 'clip_ratio/high_max': 4.11753080697963e-06, 'clip_ratio/region_mean': 5.195625465148623e-05, 'epoch': 0.43} + + 45%|████▌ | 462/1024 [21:08:19<26:02:32, 166.82s/it]INFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:33:19 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▌ | 463/1024 [21:11:02<25:47:14, 165.48s/it] + 
{'loss': 0.1054, 'grad_norm': 0.003602087963372469, 'learning_rate': 1e-05, 'num_tokens': 407677177.0, 'completions/mean_length': 7462.0546875, 'completions/min_length': 669.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6867.2587890625, 'completions/min_terminated_length': 669.0, 'completions/max_terminated_length': 16296.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.35482609272003174, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01856713369488716, 'sampling/sampling_logp_difference/max': 7.155362129211426, 'sampling/importance_sampling_ratio/min': 0.0007806668290868402, 'sampling/importance_sampling_ratio/mean': 0.9999440312385559, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8141553401947021, 'clip_ratio/low_mean': 5.367962035052187e-05, 'clip_ratio/low_min': 6.5083827394119e-06, 'clip_ratio/high_mean': 1.0519701334033016e-05, 'clip_ratio/high_max': 2.874629831239872e-05, 'clip_ratio/region_mean': 6.419932219614566e-05, 'epoch': 0.43} + + 45%|████▌ | 463/1024 [21:11:02<25:47:14, 165.48s/it]INFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:36:01 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▌ | 464/1024 [21:13:44<25:35:07, 164.48s/it] + {'loss': 0.061, 'grad_norm': 0.004038481041789055, 'learning_rate': 1e-05, 'num_tokens': 408552512.0, 'completions/mean_length': 6683.1796875, 'completions/min_length': 775.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6529.19873046875, 'completions/min_terminated_length': 775.0, 'completions/max_terminated_length': 15750.0, 'rewards/accuracy_reward/mean': 0.4609375, 
'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.25620076060295105, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02077356167137623, 'sampling/sampling_logp_difference/max': 10.014501571655273, 'sampling/importance_sampling_ratio/min': 4.474630986806005e-05, 'sampling/importance_sampling_ratio/mean': 1.0000439882278442, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9070071652531624, 'clip_ratio/low_mean': 3.5997712757307454e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.973188073468918e-06, 'clip_ratio/high_max': 2.6413443720230134e-05, 'clip_ratio/region_mean': 4.497090230870526e-05, 'epoch': 0.43} + + 45%|████▌ | 464/1024 [21:13:44<25:35:07, 164.48s/it]INFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:38:43 [block_pool.py:292] Successfully reset prefix cache + + 45%|████▌ | 465/1024 [21:16:21<25:12:34, 162.35s/it] + {'loss': 0.0295, 'grad_norm': 0.004457853268831968, 'learning_rate': 1e-05, 'num_tokens': 409399257.0, 'completions/mean_length': 6472.9453125, 'completions/min_length': 783.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5985.51611328125, 'completions/min_terminated_length': 783.0, 'completions/max_terminated_length': 15864.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.20517179369926453, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020475786179304123, 'sampling/sampling_logp_difference/max': 6.343741416931152, 'sampling/importance_sampling_ratio/min': 0.0017577135004103184, 'sampling/importance_sampling_ratio/mean': 0.9999473690986633, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 
0.8807859197258949, 'clip_ratio/low_mean': 3.225401701456576e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.292822495699511e-06, 'clip_ratio/high_max': 1.7171289982798044e-05, 'clip_ratio/region_mean': 3.654683996501262e-05, 'epoch': 0.43} + + 45%|████▌ | 465/1024 [21:16:21<25:12:34, 162.35s/it]INFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:41:21 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 466/1024 [21:18:57<24:52:25, 160.48s/it] + {'loss': 0.1217, 'grad_norm': 0.0033953245729207993, 'learning_rate': 1e-05, 'num_tokens': 410185645.0, 'completions/mean_length': 5989.78125, 'completions/min_length': 610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5654.48388671875, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 15896.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3735082745552063, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.017986344173550606, 'sampling/sampling_logp_difference/max': 10.935420036315918, 'sampling/importance_sampling_ratio/min': 1.781588616722729e-05, 'sampling/importance_sampling_ratio/mean': 0.9999676942825317, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8479711338877678, 'clip_ratio/low_mean': 5.706528349946893e-05, 'clip_ratio/low_min': 2.5156462925224332e-05, 'clip_ratio/high_mean': 1.584139977239829e-05, 'clip_ratio/high_max': 5.442162637336878e-05, 'clip_ratio/region_mean': 7.290668463610928e-05, 'epoch': 0.43} + + 46%|████▌ | 466/1024 [21:18:57<24:52:25, 160.48s/it]INFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:43:57 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 467/1024 [21:22:03<26:01:27, 168.20s/it] + {'loss': 0.0651, 'grad_norm': 0.002381247701123357, 'learning_rate': 1e-05, 'num_tokens': 411268974.0, 'completions/mean_length': 8299.9453125, 'completions/min_length': 1123.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 8171.62744140625, 'completions/min_terminated_length': 1123.0, 'completions/max_terminated_length': 16103.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2477683573961258, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021354343742132187, 'sampling/sampling_logp_difference/max': 7.4999823570251465, 'sampling/importance_sampling_ratio/min': 0.000553094083443284, 'sampling/importance_sampling_ratio/mean': 0.9999679327011108, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9363152608275414, 'clip_ratio/low_mean': 5.2673244681500364e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.2673244681500364e-05, 'epoch': 0.43} + + 46%|████▌ | 467/1024 [21:22:03<26:01:27, 168.20s/it]INFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:47:03 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 468/1024 [21:24:56<26:11:28, 169.58s/it] + {'loss': -0.003, 'grad_norm': 0.006341467145830393, 'learning_rate': 1e-05, 'num_tokens': 412238117.0, 'completions/mean_length': 7434.0546875, 'completions/min_length': 898.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 
'completions/mean_terminated_length': 7219.25634765625, 'completions/min_terminated_length': 898.0, 'completions/max_terminated_length': 14838.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02139873616397381, 'sampling/sampling_logp_difference/max': 6.249992847442627, 'sampling/importance_sampling_ratio/min': 0.0019304680172353983, 'sampling/importance_sampling_ratio/mean': 1.0000128746032715, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.981913685798645, 'clip_ratio/low_mean': 2.84454882830687e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1446739992825314e-06, 'clip_ratio/high_max': 8.578695997130126e-06, 'clip_ratio/region_mean': 3.059016239603807e-05, 'epoch': 0.43} + + 46%|████▌ | 468/1024 [21:24:56<26:11:28, 169.58s/it]INFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:49:56 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 469/1024 [21:27:44<26:03:18, 169.01s/it] + {'loss': 0.0562, 'grad_norm': 0.002621602965518832, 'learning_rate': 1e-05, 'num_tokens': 413182860.0, 'completions/mean_length': 7211.1796875, 'completions/min_length': 280.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7138.95263671875, 'completions/min_terminated_length': 280.0, 'completions/max_terminated_length': 15871.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.34716784954071045, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020250719040632248, 'sampling/sampling_logp_difference/max': 9.874974250793457, 
'sampling/importance_sampling_ratio/min': 5.1446182624204084e-05, 'sampling/importance_sampling_ratio/mean': 0.9999529123306274, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9307222217321396, 'clip_ratio/low_mean': 5.4699471832009294e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.150076049176278e-06, 'clip_ratio/high_max': 1.7187987396027893e-05, 'clip_ratio/region_mean': 5.9849548279089504e-05, 'epoch': 0.43} + + 46%|████▌ | 469/1024 [21:27:44<26:03:18, 169.01s/it]INFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:52:44 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 470/1024 [21:30:05<24:43:50, 160.70s/it] + {'loss': 0.0657, 'grad_norm': 0.0035241330042481422, 'learning_rate': 1e-05, 'num_tokens': 413885963.0, 'completions/mean_length': 5349.4296875, 'completions/min_length': 983.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5174.2783203125, 'completions/min_terminated_length': 983.0, 'completions/max_terminated_length': 15726.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.25330984592437744, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01975759118795395, 'sampling/sampling_logp_difference/max': 7.938032150268555, 'sampling/importance_sampling_ratio/min': 0.0003569081309251487, 'sampling/importance_sampling_ratio/mean': 0.9999449253082275, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0213474333286285, 'clip_ratio/low_mean': 4.7740833792886406e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9217885614561965e-06, 'clip_ratio/high_max': 1.0867412584047997e-05, 'clip_ratio/region_mean': 5.16626223543426e-05, 'epoch': 0.43} + + 
46%|████▌ | 470/1024 [21:30:05<24:43:50, 160.70s/it]INFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:55:05 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 471/1024 [21:32:57<25:11:18, 163.97s/it] + {'loss': 0.0635, 'grad_norm': 0.0014164346503093839, 'learning_rate': 1e-05, 'num_tokens': 414870560.0, 'completions/mean_length': 7542.8515625, 'completions/min_length': 1359.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7257.65283203125, 'completions/min_terminated_length': 1359.0, 'completions/max_terminated_length': 15357.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.20753081142902374, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020874422043561935, 'sampling/sampling_logp_difference/max': 9.651104927062988, 'sampling/importance_sampling_ratio/min': 6.435441900976002e-05, 'sampling/importance_sampling_ratio/mean': 1.0000402927398682, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8882969543337822, 'clip_ratio/low_mean': 2.699725871480041e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.673786522995215e-06, 'clip_ratio/high_max': 1.469514609198086e-05, 'clip_ratio/region_mean': 3.0671045237795624e-05, 'epoch': 0.43} + + 46%|████▌ | 471/1024 [21:32:57<25:11:18, 163.97s/it]INFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 10:57:56 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 472/1024 [21:35:49<25:30:47, 166.39s/it] + {'loss': 0.0567, 'grad_norm': 0.0026956009678542614, 
'learning_rate': 1e-05, 'num_tokens': 415825252.0, 'completions/mean_length': 7286.90625, 'completions/min_length': 977.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6993.451171875, 'completions/min_terminated_length': 977.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.0202642735093832, 'sampling/sampling_logp_difference/max': 6.229649543762207, 'sampling/importance_sampling_ratio/min': 0.0019701423589140177, 'sampling/importance_sampling_ratio/mean': 0.999917209148407, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9254636988043785, 'clip_ratio/low_mean': 3.673103901746799e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.17456874401978e-06, 'clip_ratio/high_max': 1.669827497607912e-05, 'clip_ratio/region_mean': 4.090560787517461e-05, 'epoch': 0.43} + + 46%|████▌ | 472/1024 [21:35:49<25:30:47, 166.39s/it]INFO 12-02 11:00:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:00:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:00:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:00:49 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▌ | 473/1024 [21:38:52<26:13:32, 171.35s/it] + {'loss': 0.0577, 'grad_norm': 0.0022128887940198183, 'learning_rate': 1e-05, 'num_tokens': 416774011.0, 'completions/mean_length': 7244.7421875, 'completions/min_length': 1010.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6716.0244140625, 'completions/min_terminated_length': 1010.0, 'completions/max_terminated_length': 15908.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 
'reward_std': 0.2937847375869751, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01840684749186039, 'sampling/sampling_logp_difference/max': 6.499997138977051, 'sampling/importance_sampling_ratio/min': 0.0015034435782581568, 'sampling/importance_sampling_ratio/mean': 1.000002384185791, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7817923128604889, 'clip_ratio/low_mean': 3.818475033767754e-05, 'clip_ratio/low_min': 7.20606476534158e-06, 'clip_ratio/high_mean': 2.2905113610249828e-06, 'clip_ratio/high_max': 9.162045444099931e-06, 'clip_ratio/region_mean': 4.047526181238936e-05, 'epoch': 0.44} + + 46%|████▌ | 473/1024 [21:38:52<26:13:32, 171.35s/it]INFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:03:51 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▋ | 474/1024 [21:42:07<27:17:09, 178.60s/it] + {'loss': 0.0504, 'grad_norm': 0.0034676652867347, 'learning_rate': 1e-05, 'num_tokens': 417951311.0, 'completions/mean_length': 9042.90625, 'completions/min_length': 997.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 8283.482421875, 'completions/min_terminated_length': 997.0, 'completions/max_terminated_length': 16254.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02112819254398346, 'sampling/sampling_logp_difference/max': 8.239109992980957, 'sampling/importance_sampling_ratio/min': 0.0002641192404553294, 'sampling/importance_sampling_ratio/mean': 0.9999234080314636, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9306210279464722, 'clip_ratio/low_mean': 3.636896872194484e-05, 
'clip_ratio/low_min': 3.1460788250115e-06, 'clip_ratio/high_mean': 3.0582178283111716e-06, 'clip_ratio/high_max': 1.2232871313244686e-05, 'clip_ratio/region_mean': 3.9427186266038916e-05, 'epoch': 0.44} + + 46%|████▋ | 474/1024 [21:42:07<27:17:09, 178.60s/it]INFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:07:07 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▋ | 475/1024 [21:44:40<26:03:40, 170.89s/it] + {'loss': 0.0704, 'grad_norm': 0.0030218157917261124, 'learning_rate': 1e-05, 'num_tokens': 418836184.0, 'completions/mean_length': 6763.6328125, 'completions/min_length': 826.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6610.9287109375, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 15721.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.30091896653175354, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021101050078868866, 'sampling/sampling_logp_difference/max': 7.880997180938721, 'sampling/importance_sampling_ratio/min': 0.0003778560785576701, 'sampling/importance_sampling_ratio/mean': 0.9999898672103882, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9879302233457565, 'clip_ratio/low_mean': 4.3606626604741905e-05, 'clip_ratio/low_min': 3.5752079838857753e-06, 'clip_ratio/high_mean': 8.202394610634656e-06, 'clip_ratio/high_max': 2.5187824576278217e-05, 'clip_ratio/region_mean': 5.1809020988002885e-05, 'epoch': 0.44} + + 46%|████▋ | 475/1024 [21:44:40<26:03:40, 170.89s/it]INFO 12-02 11:09:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:09:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:09:40 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:09:40 [block_pool.py:292] Successfully reset prefix cache + + 46%|████▋ | 476/1024 [21:47:20<25:31:16, 167.66s/it] + {'loss': 0.0592, 'grad_norm': 0.002881827764213085, 'learning_rate': 1e-05, 'num_tokens': 419726192.0, 'completions/mean_length': 6794.25, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6564.09619140625, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 15675.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.0210823193192482, 'sampling/sampling_logp_difference/max': 13.897041320800781, 'sampling/importance_sampling_ratio/min': 9.217044407705544e-07, 'sampling/importance_sampling_ratio/mean': 0.9999275207519531, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0259138569235802, 'clip_ratio/low_mean': 6.21261324340594e-05, 'clip_ratio/low_min': 3.6509140954876784e-06, 'clip_ratio/high_mean': 2.6610464374243747e-06, 'clip_ratio/high_max': 1.0644185749697499e-05, 'clip_ratio/region_mean': 6.478717887148377e-05, 'epoch': 0.44} + + 46%|████▋ | 476/1024 [21:47:20<25:31:16, 167.66s/it]INFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:12:20 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 477/1024 [21:49:42<24:18:07, 159.94s/it] + {'loss': 0.0484, 'grad_norm': 0.00289533962495625, 'learning_rate': 1e-05, 'num_tokens': 420468867.0, 'completions/mean_length': 5648.2109375, 'completions/min_length': 935.0, 'completions/max_length': 14281.0, 'completions/clipped_ratio': 0.0, 
'completions/mean_terminated_length': 5648.2109375, 'completions/min_terminated_length': 935.0, 'completions/max_terminated_length': 14281.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2675113081932068, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018499158322811127, 'sampling/sampling_logp_difference/max': 6.590811729431152, 'sampling/importance_sampling_ratio/min': 0.001372925122268498, 'sampling/importance_sampling_ratio/mean': 0.9998449087142944, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.88894472271204, 'clip_ratio/low_mean': 4.70996876629215e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.7721512196876574e-06, 'clip_ratio/high_max': 1.108860487875063e-05, 'clip_ratio/region_mean': 4.9871839337356505e-05, 'epoch': 0.44} + + 47%|████▋ | 477/1024 [21:49:42<24:18:07, 159.94s/it]INFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:14:42 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 478/1024 [21:52:34<24:46:33, 163.36s/it] + {'loss': 0.0012, 'grad_norm': 0.002749695209786296, 'learning_rate': 1e-05, 'num_tokens': 421280881.0, 'completions/mean_length': 6188.359375, 'completions/min_length': 1085.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6026.52392578125, 'completions/min_terminated_length': 1085.0, 'completions/max_terminated_length': 15657.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.15991678833961487, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.018456483259797096, 'sampling/sampling_logp_difference/max': 5.386401653289795, 
'sampling/importance_sampling_ratio/min': 0.004578418098390102, 'sampling/importance_sampling_ratio/mean': 0.9999796152114868, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8476063013076782, 'clip_ratio/low_mean': 2.4103785335682915e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1883936394951888e-06, 'clip_ratio/high_max': 4.753574557980755e-06, 'clip_ratio/region_mean': 2.5292179316238617e-05, 'epoch': 0.44} + + 47%|████▋ | 478/1024 [21:52:34<24:46:33, 163.36s/it]INFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:17:33 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 479/1024 [21:55:21<24:55:26, 164.64s/it] + {'loss': 0.0347, 'grad_norm': 0.005116373300552368, 'learning_rate': 1e-05, 'num_tokens': 422177822.0, 'completions/mean_length': 6864.3515625, 'completions/min_length': 1065.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6635.88037109375, 'completions/min_terminated_length': 1065.0, 'completions/max_terminated_length': 15112.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 'sampling/sampling_logp_difference/mean': 0.01979806460440159, 'sampling/sampling_logp_difference/max': 8.498090744018555, 'sampling/importance_sampling_ratio/min': 0.00020385721290949732, 'sampling/importance_sampling_ratio/mean': 0.9999545216560364, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8666203916072845, 'clip_ratio/low_mean': 4.786080125995795e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0339978757656354e-05, 'clip_ratio/high_max': 4.1359915030625416e-05, 'clip_ratio/region_mean': 5.8200780586048495e-05, 'epoch': 0.44} + + 
47%|████▋ | 479/1024 [21:55:21<24:55:26, 164.64s/it]INFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:20:21 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 480/1024 [21:58:20<25:30:29, 168.80s/it] + {'loss': 0.019, 'grad_norm': 0.0020944855641573668, 'learning_rate': 1e-05, 'num_tokens': 423096576.0, 'completions/mean_length': 7023.828125, 'completions/min_length': 780.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6799.18408203125, 'completions/min_terminated_length': 780.0, 'completions/max_terminated_length': 15841.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.20858672261238098, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020111342892050743, 'sampling/sampling_logp_difference/max': 5.900396347045898, 'sampling/importance_sampling_ratio/min': 0.0027383591514080763, 'sampling/importance_sampling_ratio/mean': 0.9999480247497559, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9098334684967995, 'clip_ratio/low_mean': 4.153812756158004e-05, 'clip_ratio/low_min': 3.606462769312202e-06, 'clip_ratio/high_mean': 3.6361936395223893e-06, 'clip_ratio/high_max': 1.4544774558089557e-05, 'clip_ratio/region_mean': 4.51743208031985e-05, 'epoch': 0.44} + + 47%|████▋ | 480/1024 [21:58:20<25:30:29, 168.80s/it]INFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:23:19 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 481/1024 [22:01:00<25:03:13, 166.10s/it] + {'loss': 0.1151, 'grad_norm': 
0.003897767048329115, 'learning_rate': 1e-05, 'num_tokens': 423968050.0, 'completions/mean_length': 6666.828125, 'completions/min_length': 872.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6512.587890625, 'completions/min_terminated_length': 872.0, 'completions/max_terminated_length': 15527.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3527044653892517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019923247396945953, 'sampling/sampling_logp_difference/max': 5.7499775886535645, 'sampling/importance_sampling_ratio/min': 0.0031828521750867367, 'sampling/importance_sampling_ratio/mean': 0.9999406337738037, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9162466824054718, 'clip_ratio/low_mean': 5.0774355258909054e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2372795026749372e-05, 'clip_ratio/high_max': 3.256236095694476e-05, 'clip_ratio/region_mean': 6.314715119515313e-05, 'epoch': 0.44} + + 47%|████▋ | 481/1024 [22:01:00<25:03:13, 166.10s/it]INFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:25:59 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 482/1024 [22:03:57<25:31:21, 169.52s/it] + {'loss': 0.042, 'grad_norm': 0.003038195427507162, 'learning_rate': 1e-05, 'num_tokens': 424902953.0, 'completions/mean_length': 7159.8046875, 'completions/min_length': 1022.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7013.38916015625, 'completions/min_terminated_length': 1022.0, 'completions/max_terminated_length': 16223.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 
'reward': 0.359375, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019014043733477592, 'sampling/sampling_logp_difference/max': 11.809727668762207, 'sampling/importance_sampling_ratio/min': 7.431909580191132e-06, 'sampling/importance_sampling_ratio/mean': 0.999940037727356, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8444746807217598, 'clip_ratio/low_mean': 7.980174223121139e-05, 'clip_ratio/low_min': 2.6713308216130827e-05, 'clip_ratio/high_mean': 4.791600815678976e-06, 'clip_ratio/high_max': 1.5341902098953142e-05, 'clip_ratio/region_mean': 8.459334412691533e-05, 'epoch': 0.44} + + 47%|████▋ | 482/1024 [22:03:57<25:31:21, 169.52s/it]INFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:28:57 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 483/1024 [22:06:33<24:52:27, 165.52s/it] + {'loss': 0.0548, 'grad_norm': 0.0025550283025950193, 'learning_rate': 1e-05, 'num_tokens': 425709212.0, 'completions/mean_length': 6146.2109375, 'completions/min_length': 812.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6065.5986328125, 'completions/min_terminated_length': 812.0, 'completions/max_terminated_length': 14716.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019193854182958603, 'sampling/sampling_logp_difference/max': 7.281134128570557, 'sampling/importance_sampling_ratio/min': 0.0006884043687023222, 'sampling/importance_sampling_ratio/mean': 1.0000015497207642, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8365580290555954, 'clip_ratio/low_mean': 
1.55851120666739e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.55851120666739e-05, 'epoch': 0.44} + + 47%|████▋ | 483/1024 [22:06:33<24:52:27, 165.52s/it]INFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:31:33 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 484/1024 [22:09:25<25:07:49, 167.54s/it] + {'loss': 0.0698, 'grad_norm': 0.005126865580677986, 'learning_rate': 1e-05, 'num_tokens': 426566462.0, 'completions/mean_length': 6557.578125, 'completions/min_length': 437.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6321.744140625, 'completions/min_terminated_length': 437.0, 'completions/max_terminated_length': 16153.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.27852246165275574, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01839536987245083, 'sampling/sampling_logp_difference/max': 10.499993324279785, 'sampling/importance_sampling_ratio/min': 2.7536634661373682e-05, 'sampling/importance_sampling_ratio/mean': 0.9999485015869141, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8316832035779953, 'clip_ratio/low_mean': 4.780410063176532e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.036488455014478e-06, 'clip_ratio/high_max': 2.4752349872869672e-05, 'clip_ratio/region_mean': 5.484058920046664e-05, 'epoch': 0.45} + + 47%|████▋ | 484/1024 [22:09:25<25:07:49, 167.54s/it]INFO 12-02 11:34:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:34:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:34:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 
11:34:25 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 485/1024 [22:12:07<24:47:59, 165.64s/it] + {'loss': 0.0753, 'grad_norm': 0.004829525947570801, 'learning_rate': 1e-05, 'num_tokens': 427480007.0, 'completions/mean_length': 7007.3203125, 'completions/min_length': 504.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6858.484375, 'completions/min_terminated_length': 504.0, 'completions/max_terminated_length': 16359.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3874102830886841, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019586069509387016, 'sampling/sampling_logp_difference/max': 8.508722305297852, 'sampling/importance_sampling_ratio/min': 0.00020170137577224523, 'sampling/importance_sampling_ratio/mean': 0.9998922944068909, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8674142584204674, 'clip_ratio/low_mean': 5.915772453590762e-05, 'clip_ratio/low_min': 1.7084812043322017e-05, 'clip_ratio/high_mean': 8.608928624198597e-06, 'clip_ratio/high_max': 3.443571449679439e-05, 'clip_ratio/region_mean': 6.776665304641938e-05, 'epoch': 0.45} + + 47%|████▋ | 485/1024 [22:12:07<24:47:59, 165.64s/it]INFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:37:06 [block_pool.py:292] Successfully reset prefix cache + + 47%|████▋ | 486/1024 [22:14:53<24:46:50, 165.82s/it] + {'loss': 0.0687, 'grad_norm': 0.003539952216669917, 'learning_rate': 1e-05, 'num_tokens': 428404968.0, 'completions/mean_length': 7069.8828125, 'completions/min_length': 421.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6922.0400390625, 
'completions/min_terminated_length': 421.0, 'completions/max_terminated_length': 14748.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.3618982434272766, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020427238196134567, 'sampling/sampling_logp_difference/max': 8.332671165466309, 'sampling/importance_sampling_ratio/min': 0.00024052867956925184, 'sampling/importance_sampling_ratio/mean': 0.9999353885650635, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9066255167126656, 'clip_ratio/low_mean': 5.539863354897534e-05, 'clip_ratio/low_min': 8.211341992137022e-06, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.539863354897534e-05, 'epoch': 0.45} + + 47%|████▋ | 486/1024 [22:14:53<24:46:50, 165.82s/it]INFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:39:53 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 487/1024 [22:17:20<23:54:46, 160.31s/it] + {'loss': 0.066, 'grad_norm': 0.0030504625756293535, 'learning_rate': 1e-05, 'num_tokens': 429137176.0, 'completions/mean_length': 5586.6875, 'completions/min_length': 602.0, 'completions/max_length': 15290.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5586.6875, 'completions/min_terminated_length': 602.0, 'completions/max_terminated_length': 15290.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.3480040729045868, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.019396595656871796, 'sampling/sampling_logp_difference/max': 7.50585412979126, 'sampling/importance_sampling_ratio/min': 0.0005498559912666678, 'sampling/importance_sampling_ratio/mean': 
0.9999984502792358, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9208655655384064, 'clip_ratio/low_mean': 5.576918465521885e-05, 'clip_ratio/low_min': 1.2613936178240692e-05, 'clip_ratio/high_mean': 4.137623932365386e-06, 'clip_ratio/high_max': 1.6550495729461545e-05, 'clip_ratio/region_mean': 5.99068093833921e-05, 'epoch': 0.45} + + 48%|████▊ | 487/1024 [22:17:20<23:54:46, 160.31s/it]INFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:42:20 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 488/1024 [22:19:48<23:18:44, 156.58s/it] + {'loss': -0.0077, 'grad_norm': 0.003902251599356532, 'learning_rate': 1e-05, 'num_tokens': 429836026.0, 'completions/mean_length': 5266.265625, 'completions/min_length': 492.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 4999.4404296875, 'completions/min_terminated_length': 492.0, 'completions/max_terminated_length': 15404.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2948455810546875, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01770034246146679, 'sampling/sampling_logp_difference/max': 2.868990898132324, 'sampling/importance_sampling_ratio/min': 0.05675617232918739, 'sampling/importance_sampling_ratio/mean': 0.9999457001686096, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7884859293699265, 'clip_ratio/low_mean': 3.6384140912559815e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.440377428087231e-06, 'clip_ratio/high_max': 3.3761509712348925e-05, 'clip_ratio/region_mean': 4.482451868170756e-05, 'epoch': 0.45} + + 48%|████▊ | 488/1024 [22:19:48<23:18:44, 156.58s/it]INFO 12-02 11:44:48 [block_pool.py:292] Successfully 
reset prefix cache +INFO 12-02 11:44:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:44:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:44:48 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 489/1024 [22:22:19<23:00:35, 154.83s/it] + {'loss': 0.0508, 'grad_norm': 0.0024998660665005445, 'learning_rate': 1e-05, 'num_tokens': 430673446.0, 'completions/mean_length': 6398.53125, 'completions/min_length': 699.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6319.9052734375, 'completions/min_terminated_length': 699.0, 'completions/max_terminated_length': 15754.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.31929677724838257, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020521972328424454, 'sampling/sampling_logp_difference/max': 7.397497177124023, 'sampling/importance_sampling_ratio/min': 0.000612784584518522, 'sampling/importance_sampling_ratio/mean': 0.9999797940254211, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8982341960072517, 'clip_ratio/low_mean': 4.0199149452746497e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.580925744652632e-06, 'clip_ratio/high_max': 2.2323702978610527e-05, 'clip_ratio/region_mean': 4.578007497002545e-05, 'epoch': 0.45} + + 48%|████▊ | 489/1024 [22:22:19<23:00:35, 154.83s/it]INFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:47:19 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 490/1024 [22:24:45<22:35:06, 152.26s/it] + {'loss': 0.0798, 'grad_norm': 0.00784115307033062, 'learning_rate': 1e-05, 'num_tokens': 431497546.0, 'completions/mean_length': 6277.65625, 
'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6198.07861328125, 'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 14374.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.37716054916381836, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01836184598505497, 'sampling/sampling_logp_difference/max': 7.37491512298584, 'sampling/importance_sampling_ratio/min': 0.0006267798598855734, 'sampling/importance_sampling_ratio/mean': 0.999848484992981, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8139145970344543, 'clip_ratio/low_mean': 8.124458963720826e-05, 'clip_ratio/low_min': 1.2379174222587608e-05, 'clip_ratio/high_mean': 7.939156091651967e-06, 'clip_ratio/high_max': 3.1756624366607866e-05, 'clip_ratio/region_mean': 8.91837471499457e-05, 'epoch': 0.45} + + 48%|████▊ | 490/1024 [22:24:45<22:35:06, 152.26s/it]INFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:49:45 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 491/1024 [22:27:42<23:36:41, 159.48s/it] + {'loss': 0.035, 'grad_norm': 0.004277343396097422, 'learning_rate': 1e-05, 'num_tokens': 432503414.0, 'completions/mean_length': 7708.59375, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7355.9345703125, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15903.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.1633366346359253, 'frac_reward_zero_std': 0.625, 
'sampling/sampling_logp_difference/mean': 0.02224145457148552, 'sampling/sampling_logp_difference/max': 11.315095901489258, 'sampling/importance_sampling_ratio/min': 1.2187546417408157e-05, 'sampling/importance_sampling_ratio/mean': 0.9999503493309021, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.087083138525486, 'clip_ratio/low_mean': 2.3825880248296016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2187512058735592e-06, 'clip_ratio/high_max': 8.875004823494237e-06, 'clip_ratio/region_mean': 2.6044631454169576e-05, 'epoch': 0.45} + + 48%|████▊ | 491/1024 [22:27:42<23:36:41, 159.48s/it]INFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:52:41 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 492/1024 [22:30:38<24:17:48, 164.41s/it] + {'loss': 0.0459, 'grad_norm': 0.006278311368077993, 'learning_rate': 1e-05, 'num_tokens': 433439137.0, 'completions/mean_length': 7162.7109375, 'completions/min_length': 842.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6865.25, 'completions/min_terminated_length': 842.0, 'completions/max_terminated_length': 15576.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2227931171655655, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02123419940471649, 'sampling/sampling_logp_difference/max': 7.499768257141113, 'sampling/importance_sampling_ratio/min': 0.0005532125360332429, 'sampling/importance_sampling_ratio/mean': 0.999966561794281, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9157010763883591, 'clip_ratio/low_mean': 3.561227788395627e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5960163182171527e-06, 
'clip_ratio/high_max': 6.384065272868611e-06, 'clip_ratio/region_mean': 3.720829374742607e-05, 'epoch': 0.45} + + 48%|████▊ | 492/1024 [22:30:38<24:17:48, 164.41s/it]INFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:55:37 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 493/1024 [22:33:20<24:10:55, 163.95s/it] + {'loss': 0.0556, 'grad_norm': 0.005177734419703484, 'learning_rate': 1e-05, 'num_tokens': 434402045.0, 'completions/mean_length': 7388.90625, 'completions/min_length': 980.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7023.251953125, 'completions/min_terminated_length': 980.0, 'completions/max_terminated_length': 16123.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.37951958179473877, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01827731542289257, 'sampling/sampling_logp_difference/max': 6.096303939819336, 'sampling/importance_sampling_ratio/min': 0.0022511729039251804, 'sampling/importance_sampling_ratio/mean': 0.9999250769615173, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7670486867427826, 'clip_ratio/low_mean': 5.1716241614485625e-05, 'clip_ratio/low_min': 3.601579010137357e-06, 'clip_ratio/high_mean': 8.656040449750435e-06, 'clip_ratio/high_max': 2.846911434062349e-05, 'clip_ratio/region_mean': 6.037228104105452e-05, 'epoch': 0.45} + + 48%|████▊ | 493/1024 [22:33:20<24:10:55, 163.95s/it]INFO 12-02 11:58:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:58:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:58:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 11:58:20 [block_pool.py:292] Successfully 
reset prefix cache + + 48%|████▊ | 494/1024 [22:36:15<24:36:02, 167.10s/it] + {'loss': 0.0662, 'grad_norm': 0.0032320048194378614, 'learning_rate': 1e-05, 'num_tokens': 435292029.0, 'completions/mean_length': 6805.375, 'completions/min_length': 587.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6496.38671875, 'completions/min_terminated_length': 587.0, 'completions/max_terminated_length': 15767.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018761277198791504, 'sampling/sampling_logp_difference/max': 9.613814353942871, 'sampling/importance_sampling_ratio/min': 6.679954094579443e-05, 'sampling/importance_sampling_ratio/mean': 0.9999642372131348, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8407405763864517, 'clip_ratio/low_mean': 7.719641234871233e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.257203722270788e-06, 'clip_ratio/high_max': 2.1548471977439476e-05, 'clip_ratio/region_mean': 8.345361538886209e-05, 'epoch': 0.45} + + 48%|████▊ | 494/1024 [22:36:15<24:36:02, 167.10s/it]INFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:01:14 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 495/1024 [22:38:54<24:12:53, 164.79s/it] + {'loss': 0.0279, 'grad_norm': 0.0030854379292577505, 'learning_rate': 1e-05, 'num_tokens': 436046842.0, 'completions/mean_length': 5753.4140625, 'completions/min_length': 946.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 5321.2763671875, 'completions/min_terminated_length': 946.0, 
'completions/max_terminated_length': 15105.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.31405961513519287, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017733070999383926, 'sampling/sampling_logp_difference/max': 19.24954605102539, 'sampling/importance_sampling_ratio/min': 4.36544311810394e-09, 'sampling/importance_sampling_ratio/mean': 0.9998626708984375, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7848984077572823, 'clip_ratio/low_mean': 7.76378024056612e-05, 'clip_ratio/low_min': 1.7026316072588088e-05, 'clip_ratio/high_mean': 8.65123752191721e-07, 'clip_ratio/high_max': 3.460495008766884e-06, 'clip_ratio/region_mean': 7.850292649891344e-05, 'epoch': 0.46} + + 48%|████▊ | 495/1024 [22:38:54<24:12:53, 164.79s/it]INFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:03:54 [block_pool.py:292] Successfully reset prefix cache + + 48%|████▊ | 496/1024 [22:41:33<23:54:03, 162.96s/it] + {'loss': 0.0805, 'grad_norm': 0.003124243812635541, 'learning_rate': 1e-05, 'num_tokens': 436899638.0, 'completions/mean_length': 6522.84375, 'completions/min_length': 1062.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6445.19677734375, 'completions/min_terminated_length': 1062.0, 'completions/max_terminated_length': 15682.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2706219553947449, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.021180003881454468, 'sampling/sampling_logp_difference/max': 12.316575050354004, 'sampling/importance_sampling_ratio/min': 4.476920821616659e-06, 
'sampling/importance_sampling_ratio/mean': 0.9999418258666992, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0593653172254562, 'clip_ratio/low_mean': 3.234025916754035e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.301897092773288e-06, 'clip_ratio/high_max': 1.7207588371093152e-05, 'clip_ratio/region_mean': 3.664215591925313e-05, 'epoch': 0.46} + + 48%|████▊ | 496/1024 [22:41:33<23:54:03, 162.96s/it]INFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:06:33 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▊ | 497/1024 [22:44:12<23:40:16, 161.70s/it] + {'loss': 0.0817, 'grad_norm': 0.005001795012503862, 'learning_rate': 1e-05, 'num_tokens': 437713008.0, 'completions/mean_length': 6203.203125, 'completions/min_length': 1017.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5874.7900390625, 'completions/min_terminated_length': 1017.0, 'completions/max_terminated_length': 14515.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.26143795251846313, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017751028761267662, 'sampling/sampling_logp_difference/max': 6.34374475479126, 'sampling/importance_sampling_ratio/min': 0.001757707679644227, 'sampling/importance_sampling_ratio/mean': 0.9999101758003235, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8152795508503914, 'clip_ratio/low_mean': 2.8437304308681632e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.9476084932866797e-06, 'clip_ratio/high_max': 1.1790433973146719e-05, 'clip_ratio/region_mean': 3.138491274512489e-05, 'epoch': 0.46} + + 49%|████▊ | 497/1024 [22:44:12<23:40:16, 161.70s/it]INFO 12-02 12:09:11 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:09:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:09:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:09:11 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▊ | 498/1024 [22:46:41<23:05:51, 158.08s/it] + {'loss': 0.0759, 'grad_norm': 0.005084732081741095, 'learning_rate': 1e-05, 'num_tokens': 438495811.0, 'completions/mean_length': 5975.5234375, 'completions/min_length': 690.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5725.72021484375, 'completions/min_terminated_length': 690.0, 'completions/max_terminated_length': 15423.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.28513264656066895, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018671832978725433, 'sampling/sampling_logp_difference/max': 10.374839782714844, 'sampling/importance_sampling_ratio/min': 3.120788460364565e-05, 'sampling/importance_sampling_ratio/mean': 0.9998699426651001, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8275932744145393, 'clip_ratio/low_mean': 4.4599403963729856e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.229499381835922e-06, 'clip_ratio/high_max': 1.3163793028070359e-05, 'clip_ratio/region_mean': 4.882890357293945e-05, 'epoch': 0.46} + + 49%|████▊ | 498/1024 [22:46:41<23:05:51, 158.08s/it]INFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:11:41 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▊ | 499/1024 [22:49:11<22:41:04, 155.55s/it] + {'loss': 0.0282, 'grad_norm': 0.002567912917584181, 'learning_rate': 1e-05, 'num_tokens': 439413055.0, 
'completions/mean_length': 7019.59375, 'completions/min_length': 1058.0, 'completions/max_length': 16110.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7019.59375, 'completions/min_terminated_length': 1058.0, 'completions/max_terminated_length': 16110.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.02012534812092781, 'sampling/sampling_logp_difference/max': 6.876677513122559, 'sampling/importance_sampling_ratio/min': 0.0010315657127648592, 'sampling/importance_sampling_ratio/mean': 1.0000476837158203, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9266618490219116, 'clip_ratio/low_mean': 3.0413870263146237e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.07410060588154e-07, 'clip_ratio/high_max': 3.229640242352616e-06, 'clip_ratio/region_mean': 3.1221280551108066e-05, 'epoch': 0.46} + + 49%|████▊ | 499/1024 [22:49:11<22:41:04, 155.55s/it]INFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:14:11 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▉ | 500/1024 [22:52:01<23:17:03, 159.97s/it] + {'loss': 0.0617, 'grad_norm': 0.004862098954617977, 'learning_rate': 1e-05, 'num_tokens': 440375128.0, 'completions/mean_length': 7373.3203125, 'completions/min_length': 854.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7082.65283203125, 'completions/min_terminated_length': 854.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 
'sampling/sampling_logp_difference/mean': 0.020596595481038094, 'sampling/sampling_logp_difference/max': 7.28115701675415, 'sampling/importance_sampling_ratio/min': 0.0006883886526338756, 'sampling/importance_sampling_ratio/mean': 0.9999188780784607, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9383682310581207, 'clip_ratio/low_mean': 4.08189575864526e-05, 'clip_ratio/low_min': 4.041122338094283e-06, 'clip_ratio/high_mean': 4.5819448359907256e-06, 'clip_ratio/high_max': 1.8327779343962902e-05, 'clip_ratio/region_mean': 4.5400901854009135e-05, 'epoch': 0.46} + + 49%|████▉ | 500/1024 [22:52:01<23:17:03, 159.97s/it]INFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:17:01 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▉ | 501/1024 [22:54:38<23:05:06, 158.90s/it] + {'loss': 0.0316, 'grad_norm': 0.003041388699784875, 'learning_rate': 1e-05, 'num_tokens': 441156306.0, 'completions/mean_length': 5944.953125, 'completions/min_length': 330.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5862.755859375, 'completions/min_terminated_length': 330.0, 'completions/max_terminated_length': 16280.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3345639705657959, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019817989319562912, 'sampling/sampling_logp_difference/max': 7.171038627624512, 'sampling/importance_sampling_ratio/min': 0.0007685241289436817, 'sampling/importance_sampling_ratio/mean': 0.9999566078186035, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9130716845393181, 'clip_ratio/low_mean': 6.364750265674957e-05, 'clip_ratio/low_min': 3.94595599573222e-06, 
'clip_ratio/high_mean': 4.12654787851352e-06, 'clip_ratio/high_max': 1.650619151405408e-05, 'clip_ratio/region_mean': 6.77740499668289e-05, 'epoch': 0.46} + + 49%|████▉ | 501/1024 [22:54:38<23:05:06, 158.90s/it]INFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:19:37 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▉ | 502/1024 [22:57:32<23:42:47, 163.54s/it] + {'loss': 0.0306, 'grad_norm': 0.005679543130099773, 'learning_rate': 1e-05, 'num_tokens': 442032972.0, 'completions/mean_length': 6686.015625, 'completions/min_length': 1018.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6609.6533203125, 'completions/min_terminated_length': 1018.0, 'completions/max_terminated_length': 16181.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.24988999962806702, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019738182425498962, 'sampling/sampling_logp_difference/max': 4.86245584487915, 'sampling/importance_sampling_ratio/min': 0.007731473073363304, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8640913739800453, 'clip_ratio/low_mean': 3.147234815514821e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.205811807078135e-06, 'clip_ratio/high_max': 2.9951792839710834e-05, 'clip_ratio/region_mean': 4.0678160075913183e-05, 'epoch': 0.46} + + 49%|████▉ | 502/1024 [22:57:32<23:42:47, 163.54s/it]INFO 12-02 12:22:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:22:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:22:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:22:32 
[block_pool.py:292] Successfully reset prefix cache + + 49%|████▉ | 503/1024 [23:00:37<24:34:44, 169.84s/it] + {'loss': 0.0756, 'grad_norm': 0.006176612339913845, 'learning_rate': 1e-05, 'num_tokens': 442940940.0, 'completions/mean_length': 6945.5, 'completions/min_length': 1031.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6231.6640625, 'completions/min_terminated_length': 1031.0, 'completions/max_terminated_length': 15951.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.29644322395324707, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01836501806974411, 'sampling/sampling_logp_difference/max': 8.607227325439453, 'sampling/importance_sampling_ratio/min': 0.00018278000061400235, 'sampling/importance_sampling_ratio/mean': 0.9999117851257324, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8156519457697868, 'clip_ratio/low_mean': 3.858067566397949e-05, 'clip_ratio/low_min': 9.290916750614997e-06, 'clip_ratio/high_mean': 7.5476494316717435e-06, 'clip_ratio/high_max': 3.0190597726686974e-05, 'clip_ratio/region_mean': 4.612832617567619e-05, 'epoch': 0.46} + + 49%|████▉ | 503/1024 [23:00:37<24:34:44, 169.84s/it]INFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:25:36 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▉ | 504/1024 [23:03:54<25:43:39, 178.11s/it] + {'loss': 0.0386, 'grad_norm': 0.0021770994644612074, 'learning_rate': 1e-05, 'num_tokens': 443992041.0, 'completions/mean_length': 8068.5390625, 'completions/min_length': 875.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.078125, 'completions/mean_terminated_length': 7363.8388671875, 
'completions/min_terminated_length': 875.0, 'completions/max_terminated_length': 15847.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.30115634202957153, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019003838300704956, 'sampling/sampling_logp_difference/max': 8.624998092651367, 'sampling/importance_sampling_ratio/min': 0.0001795605494407937, 'sampling/importance_sampling_ratio/mean': 0.9999759197235107, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8196670189499855, 'clip_ratio/low_mean': 3.060894187001395e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.28071654773521e-06, 'clip_ratio/high_max': 2.2105000425653998e-05, 'clip_ratio/region_mean': 3.6889658531436e-05, 'epoch': 0.46} + + 49%|████▉ | 504/1024 [23:03:54<25:43:39, 178.11s/it]INFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:28:54 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▉ | 505/1024 [23:06:29<24:41:10, 171.23s/it] + {'loss': 0.063, 'grad_norm': 0.00788798462599516, 'learning_rate': 1e-05, 'num_tokens': 444679675.0, 'completions/mean_length': 5209.140625, 'completions/min_length': 136.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5031.76220703125, 'completions/min_terminated_length': 136.0, 'completions/max_terminated_length': 15168.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.33220988512039185, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018808994442224503, 'sampling/sampling_logp_difference/max': 8.267484664916992, 'sampling/importance_sampling_ratio/min': 0.00025673024356365204, 
'sampling/importance_sampling_ratio/mean': 0.9999796748161316, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8851845487952232, 'clip_ratio/low_mean': 4.5685408849749365e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.2181025062527624e-06, 'clip_ratio/high_max': 1.287241002501105e-05, 'clip_ratio/region_mean': 4.89035115833758e-05, 'epoch': 0.46} + + 49%|████▉ | 505/1024 [23:06:29<24:41:10, 171.23s/it]INFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:31:29 [block_pool.py:292] Successfully reset prefix cache + + 49%|████▉ | 506/1024 [23:09:26<24:53:12, 172.96s/it] + {'loss': 0.1586, 'grad_norm': 0.004547314252704382, 'learning_rate': 1e-05, 'num_tokens': 445668126.0, 'completions/mean_length': 7558.8984375, 'completions/min_length': 707.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7274.21728515625, 'completions/min_terminated_length': 707.0, 'completions/max_terminated_length': 16259.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.42293959856033325, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.02099413052201271, 'sampling/sampling_logp_difference/max': 9.059958457946777, 'sampling/importance_sampling_ratio/min': 0.00011622780584730208, 'sampling/importance_sampling_ratio/mean': 0.999848484992981, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.003449946641922, 'clip_ratio/low_mean': 5.944662643742049e-05, 'clip_ratio/low_min': 8.106994755507912e-06, 'clip_ratio/high_mean': 6.590465602585027e-06, 'clip_ratio/high_max': 2.294301202709903e-05, 'clip_ratio/region_mean': 6.603709243790945e-05, 'epoch': 0.47} + + 49%|████▉ | 506/1024 [23:09:26<24:53:12, 172.96s/it]INFO 12-02 
12:34:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:34:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:34:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:34:26 [block_pool.py:292] Successfully reset prefix cache + + 50%|████▉ | 507/1024 [23:12:21<24:55:19, 173.54s/it] + {'loss': 0.121, 'grad_norm': 0.004621773958206177, 'learning_rate': 1e-05, 'num_tokens': 446464587.0, 'completions/mean_length': 6066.6015625, 'completions/min_length': 1107.0, 'completions/max_length': 16137.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6066.6015625, 'completions/min_terminated_length': 1107.0, 'completions/max_terminated_length': 16137.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3537652790546417, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018016980960965157, 'sampling/sampling_logp_difference/max': 11.179987907409668, 'sampling/importance_sampling_ratio/min': 1.3950601896794979e-05, 'sampling/importance_sampling_ratio/mean': 1.0000154972076416, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8450648710131645, 'clip_ratio/low_mean': 8.880347786544007e-05, 'clip_ratio/low_min': 9.06585455595632e-06, 'clip_ratio/high_mean': 6.047981628398702e-06, 'clip_ratio/high_max': 2.1350435872591333e-05, 'clip_ratio/region_mean': 9.485145938015194e-05, 'epoch': 0.47} + + 50%|████▉ | 507/1024 [23:12:21<24:55:19, 173.54s/it]INFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:37:21 [block_pool.py:292] Successfully reset prefix cache + + 50%|████▉ | 508/1024 [23:15:04<24:26:00, 170.47s/it] + {'loss': 0.0396, 'grad_norm': 0.004523546434938908, 'learning_rate': 1e-05, 'num_tokens': 447381134.0, 
'completions/mean_length': 6988.0234375, 'completions/min_length': 571.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6838.88134765625, 'completions/min_terminated_length': 571.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.22567617893218994, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021812722086906433, 'sampling/sampling_logp_difference/max': 4.124781131744385, 'sampling/importance_sampling_ratio/min': 0.016167031601071358, 'sampling/importance_sampling_ratio/mean': 0.9999901056289673, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0452716201543808, 'clip_ratio/low_mean': 2.149350007130124e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.633681207153131e-07, 'clip_ratio/high_max': 3.0534724828612525e-06, 'clip_ratio/region_mean': 2.2256868305703392e-05, 'epoch': 0.47} + + 50%|████▉ | 508/1024 [23:15:04<24:26:00, 170.47s/it]INFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:40:04 [block_pool.py:292] Successfully reset prefix cache + + 50%|████▉ | 509/1024 [23:17:09<22:24:40, 156.66s/it] + {'loss': 0.0188, 'grad_norm': 0.004002885892987251, 'learning_rate': 1e-05, 'num_tokens': 448158014.0, 'completions/mean_length': 5948.5, 'completions/min_length': 1252.0, 'completions/max_length': 12316.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5948.5, 'completions/min_terminated_length': 1252.0, 'completions/max_terminated_length': 12316.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.3124620020389557, 'frac_reward_zero_std': 0.3125, 
'sampling/sampling_logp_difference/mean': 0.018487900495529175, 'sampling/sampling_logp_difference/max': 7.062494277954102, 'sampling/importance_sampling_ratio/min': 0.0008566387114115059, 'sampling/importance_sampling_ratio/mean': 0.9999228715896606, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8241566568613052, 'clip_ratio/low_mean': 3.684896307731833e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3968978009870625e-06, 'clip_ratio/high_max': 5.58759120394825e-06, 'clip_ratio/region_mean': 3.824586099199223e-05, 'epoch': 0.47} + + 50%|████▉ | 509/1024 [23:17:09<22:24:40, 156.66s/it]INFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:42:08 [block_pool.py:292] Successfully reset prefix cache + + 50%|████▉ | 510/1024 [23:20:04<23:08:43, 162.11s/it] + {'loss': 0.0787, 'grad_norm': 0.0019062751671299338, 'learning_rate': 1e-05, 'num_tokens': 449197054.0, 'completions/mean_length': 7966.375, 'completions/min_length': 660.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7764.3525390625, 'completions/min_terminated_length': 660.0, 'completions/max_terminated_length': 16044.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020015282556414604, 'sampling/sampling_logp_difference/max': 8.731462478637695, 'sampling/importance_sampling_ratio/min': 0.0001614262000657618, 'sampling/importance_sampling_ratio/mean': 0.9999173879623413, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8868448063731194, 'clip_ratio/low_mean': 3.973086239739132e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.439610338773491e-06, 
'clip_ratio/high_max': 1.0490723752809572e-05, 'clip_ratio/region_mean': 4.3170473020381905e-05, 'epoch': 0.47} + + 50%|████▉ | 510/1024 [23:20:04<23:08:43, 162.11s/it]INFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:45:03 [block_pool.py:292] Successfully reset prefix cache + + 50%|████▉ | 511/1024 [23:22:36<22:41:53, 159.29s/it] + {'loss': 0.0547, 'grad_norm': 0.00490277074277401, 'learning_rate': 1e-05, 'num_tokens': 450050153.0, 'completions/mean_length': 6520.0234375, 'completions/min_length': 461.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6442.3544921875, 'completions/min_terminated_length': 461.0, 'completions/max_terminated_length': 16124.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3437528908252716, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.020178331062197685, 'sampling/sampling_logp_difference/max': 12.324441909790039, 'sampling/importance_sampling_ratio/min': 4.4418397919798736e-06, 'sampling/importance_sampling_ratio/mean': 0.9998800754547119, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9168323278427124, 'clip_ratio/low_mean': 3.558348203114292e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0487764206554857e-06, 'clip_ratio/high_max': 1.2195105682621943e-05, 'clip_ratio/region_mean': 3.8632259474979946e-05, 'epoch': 0.47} + + 50%|████▉ | 511/1024 [23:22:36<22:41:53, 159.29s/it]INFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:47:36 [block_pool.py:292] Successfully reset prefix 
cache + + 50%|█████ | 512/1024 [23:25:44<23:52:32, 167.88s/it] + {'loss': -0.0023, 'grad_norm': 0.003792276605963707, 'learning_rate': 1e-05, 'num_tokens': 450915281.0, 'completions/mean_length': 6614.5625, 'completions/min_length': 429.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6217.4306640625, 'completions/min_terminated_length': 429.0, 'completions/max_terminated_length': 16252.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.20069602131843567, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019233014434576035, 'sampling/sampling_logp_difference/max': 5.40609884262085, 'sampling/importance_sampling_ratio/min': 0.004489119164645672, 'sampling/importance_sampling_ratio/mean': 0.9999154806137085, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8635925352573395, 'clip_ratio/low_mean': 3.363800146871654e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.989432121263235e-06, 'clip_ratio/high_max': 7.95772848505294e-06, 'clip_ratio/region_mean': 3.562743381735345e-05, 'epoch': 0.47} + + 50%|█████ | 512/1024 [23:25:44<23:52:32, 167.88s/it]INFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:50:46 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 50%|█████ | 513/1024 [23:28:38<24:05:47, 169.76s/it] + {'loss': 0.0287, 'grad_norm': 0.0031763892620801926, 'learning_rate': 1e-05, 'num_tokens': 451761322.0, 'completions/mean_length': 6458.5078125, 'completions/min_length': 1025.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 5970.36865234375, 'completions/min_terminated_length': 1025.0, 'completions/max_terminated_length': 16206.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.282474160194397, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01935420371592045, 'sampling/sampling_logp_difference/max': 9.24997615814209, 'sampling/importance_sampling_ratio/min': 9.611394489184022e-05, 'sampling/importance_sampling_ratio/mean': 0.9999036192893982, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8816124573349953, 'clip_ratio/low_mean': 3.4846169796765025e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.67555605105008e-06, 'clip_ratio/high_max': 1.6306271390931215e-05, 'clip_ratio/region_mean': 4.1521726302562456e-05, 'epoch': 0.47} + + 50%|█████ | 513/1024 [23:28:38<24:05:47, 169.76s/it]INFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:53:38 [block_pool.py:292] Successfully reset prefix cache + + 50%|█████ | 514/1024 [23:31:14<23:26:07, 165.43s/it] + {'loss': 0.1094, 'grad_norm': 0.004134794697165489, 'learning_rate': 1e-05, 'num_tokens': 452526342.0, 'completions/mean_length': 5844.03125, 'completions/min_length': 237.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5676.73046875, 'completions/min_terminated_length': 237.0, 
'completions/max_terminated_length': 15928.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.28930899500846863, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02013866975903511, 'sampling/sampling_logp_difference/max': 8.951433181762695, 'sampling/importance_sampling_ratio/min': 0.00012955136480741203, 'sampling/importance_sampling_ratio/mean': 0.9999297857284546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9008020162582397, 'clip_ratio/low_mean': 2.2518463538290234e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0314158721012063e-06, 'clip_ratio/high_max': 7.861634912842419e-06, 'clip_ratio/region_mean': 2.554987941039144e-05, 'epoch': 0.47} + + 50%|█████ | 514/1024 [23:31:14<23:26:07, 165.43s/it]INFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:56:13 [block_pool.py:292] Successfully reset prefix cache + + 50%|█████ | 515/1024 [23:33:57<23:18:56, 164.90s/it] + {'loss': 0.0193, 'grad_norm': 0.0022520655766129494, 'learning_rate': 1e-05, 'num_tokens': 453343385.0, 'completions/mean_length': 6214.5859375, 'completions/min_length': 1096.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6134.51171875, 'completions/min_terminated_length': 1096.0, 'completions/max_terminated_length': 16180.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.20623260736465454, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019947605207562447, 'sampling/sampling_logp_difference/max': 10.187482833862305, 'sampling/importance_sampling_ratio/min': 3.763851054827683e-05, 'sampling/importance_sampling_ratio/mean': 
0.9999879598617554, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9522949978709221, 'clip_ratio/low_mean': 2.444096298859222e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.177790176778217e-06, 'clip_ratio/high_max': 1.2711160707112867e-05, 'clip_ratio/region_mean': 2.761875293799676e-05, 'epoch': 0.47} + + 50%|█████ | 515/1024 [23:33:57<23:18:56, 164.90s/it]INFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 12:58:57 [block_pool.py:292] Successfully reset prefix cache + + 50%|█████ | 516/1024 [23:36:44<23:21:12, 165.50s/it] + {'loss': 0.0609, 'grad_norm': 0.004887089133262634, 'learning_rate': 1e-05, 'num_tokens': 454275379.0, 'completions/mean_length': 7138.515625, 'completions/min_length': 846.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7065.71630859375, 'completions/min_terminated_length': 846.0, 'completions/max_terminated_length': 14376.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.32035762071609497, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019449077546596527, 'sampling/sampling_logp_difference/max': 5.312184810638428, 'sampling/importance_sampling_ratio/min': 0.004931141622364521, 'sampling/importance_sampling_ratio/mean': 0.9999544620513916, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8856206461787224, 'clip_ratio/low_mean': 3.371703428456385e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.431061753009999e-05, 'clip_ratio/high_max': 5.724247012039996e-05, 'clip_ratio/region_mean': 4.8027652155724354e-05, 'epoch': 0.47} + + 50%|█████ | 516/1024 [23:36:44<23:21:12, 165.50s/it]INFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:01:44 [block_pool.py:292] Successfully reset prefix cache + + 50%|█████ | 517/1024 [23:39:19<22:50:46, 162.22s/it] + {'loss': 0.0366, 'grad_norm': 0.003875041613355279, 'learning_rate': 1e-05, 'num_tokens': 455076625.0, 'completions/mean_length': 6077.796875, 'completions/min_length': 954.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5915.00830078125, 'completions/min_terminated_length': 954.0, 'completions/max_terminated_length': 15855.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.23933593928813934, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018907926976680756, 'sampling/sampling_logp_difference/max': 10.31219482421875, 'sampling/importance_sampling_ratio/min': 3.322543852846138e-05, 'sampling/importance_sampling_ratio/mean': 1.0000392198562622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.862022191286087, 'clip_ratio/low_mean': 4.936055870530254e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9019220139380195e-06, 'clip_ratio/high_max': 1.5607688055752078e-05, 'clip_ratio/region_mean': 5.326248106030107e-05, 'epoch': 0.48} + + 50%|█████ | 517/1024 [23:39:19<22:50:46, 162.22s/it]INFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:04:18 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████ | 518/1024 [23:41:40<21:55:15, 155.96s/it] + {'loss': 0.0822, 'grad_norm': 0.004288897849619389, 'learning_rate': 1e-05, 'num_tokens': 455889693.0, 'completions/mean_length': 6211.65625, 'completions/min_length': 
1292.0, 'completions/max_length': 15316.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6211.65625, 'completions/min_terminated_length': 1292.0, 'completions/max_terminated_length': 15316.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.27145031094551086, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01986120268702507, 'sampling/sampling_logp_difference/max': 12.874927520751953, 'sampling/importance_sampling_ratio/min': 2.5614745027269237e-06, 'sampling/importance_sampling_ratio/mean': 0.9999270439147949, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8835236355662346, 'clip_ratio/low_mean': 3.7409978290270374e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.535163386914064e-06, 'clip_ratio/high_max': 1.0557040241110371e-05, 'clip_ratio/region_mean': 4.0945141790871276e-05, 'epoch': 0.48} + + 51%|█████ | 518/1024 [23:41:40<21:55:15, 155.96s/it]INFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:06:40 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████ | 519/1024 [23:44:24<22:13:33, 158.44s/it] + {'loss': 0.0311, 'grad_norm': 0.004230308346450329, 'learning_rate': 1e-05, 'num_tokens': 456809643.0, 'completions/mean_length': 7035.609375, 'completions/min_length': 762.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6962.0, 'completions/min_terminated_length': 762.0, 'completions/max_terminated_length': 16128.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.17282497882843018, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020262110978364944, 
'sampling/sampling_logp_difference/max': 10.99984073638916, 'sampling/importance_sampling_ratio/min': 1.670435995038133e-05, 'sampling/importance_sampling_ratio/mean': 0.9999722242355347, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9033957049250603, 'clip_ratio/low_mean': 3.578249538804812e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.105663826223463e-07, 'clip_ratio/high_max': 2.842265530489385e-06, 'clip_ratio/region_mean': 3.649306199804414e-05, 'epoch': 0.48} + + 51%|█████ | 519/1024 [23:44:24<22:13:33, 158.44s/it]INFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:09:24 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████ | 520/1024 [23:47:07<22:21:16, 159.68s/it] + {'loss': 0.0204, 'grad_norm': 0.0029154124204069376, 'learning_rate': 1e-05, 'num_tokens': 457669431.0, 'completions/mean_length': 6557.40625, 'completions/min_length': 1136.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6321.568359375, 'completions/min_terminated_length': 1136.0, 'completions/max_terminated_length': 16241.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019474683329463005, 'sampling/sampling_logp_difference/max': 9.746816635131836, 'sampling/importance_sampling_ratio/min': 5.8480534789850935e-05, 'sampling/importance_sampling_ratio/mean': 1.0000226497650146, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8352414071559906, 'clip_ratio/low_mean': 2.8534720058814855e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.844010264714598e-06, 'clip_ratio/high_max': 3.539844283295679e-05, 'clip_ratio/region_mean': 
3.837873060774655e-05, 'epoch': 0.48} + + 51%|█████ | 520/1024 [23:47:07<22:21:16, 159.68s/it]INFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:12:07 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████ | 521/1024 [23:49:54<22:36:20, 161.79s/it] + {'loss': 0.0248, 'grad_norm': 0.0025195449125021696, 'learning_rate': 1e-05, 'num_tokens': 458512648.0, 'completions/mean_length': 6444.1328125, 'completions/min_length': 398.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6205.576171875, 'completions/min_terminated_length': 398.0, 'completions/max_terminated_length': 15428.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2585597634315491, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01779567077755928, 'sampling/sampling_logp_difference/max': 10.624913215637207, 'sampling/importance_sampling_ratio/min': 2.4302940801135264e-05, 'sampling/importance_sampling_ratio/mean': 0.999996542930603, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7480100840330124, 'clip_ratio/low_mean': 5.166920755073079e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.917558859076962e-05, 'clip_ratio/high_max': 6.400114170901361e-05, 'clip_ratio/region_mean': 7.084479466357152e-05, 'epoch': 0.48} + + 51%|█████ | 521/1024 [23:49:54<22:36:20, 161.79s/it]INFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:14:53 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████ | 522/1024 [23:52:43<22:51:47, 163.96s/it] + {'loss': 
0.0608, 'grad_norm': 0.004339073318988085, 'learning_rate': 1e-05, 'num_tokens': 459377790.0, 'completions/mean_length': 6615.234375, 'completions/min_length': 105.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6380.7841796875, 'completions/min_terminated_length': 105.0, 'completions/max_terminated_length': 15868.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.31064465641975403, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018815383315086365, 'sampling/sampling_logp_difference/max': 7.76359748840332, 'sampling/importance_sampling_ratio/min': 0.00042492515058256686, 'sampling/importance_sampling_ratio/mean': 0.9999370574951172, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8428665772080421, 'clip_ratio/low_mean': 3.4855478702411347e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.87236081375886e-07, 'clip_ratio/high_max': 2.748944325503544e-06, 'clip_ratio/region_mean': 3.5542715181691165e-05, 'epoch': 0.48} + + 51%|█████ | 522/1024 [23:52:43<22:51:47, 163.96s/it]INFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:17:42 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████ | 523/1024 [23:55:16<22:21:33, 160.67s/it] + {'loss': 0.0502, 'grad_norm': 0.005003004334867001, 'learning_rate': 1e-05, 'num_tokens': 460189823.0, 'completions/mean_length': 6200.3203125, 'completions/min_length': 1032.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5955.912109375, 'completions/min_terminated_length': 1032.0, 'completions/max_terminated_length': 15239.0, 'rewards/accuracy_reward/mean': 0.484375, 
'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.0192951001226902, 'sampling/sampling_logp_difference/max': 5.2945051193237305, 'sampling/importance_sampling_ratio/min': 0.005019097588956356, 'sampling/importance_sampling_ratio/mean': 0.9999645948410034, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9044734612107277, 'clip_ratio/low_mean': 2.2591082483813807e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.496596083456097e-06, 'clip_ratio/high_max': 2.2513844896820956e-05, 'clip_ratio/region_mean': 3.0087678169365972e-05, 'epoch': 0.48} + + 51%|█████ | 523/1024 [23:55:16<22:21:33, 160.67s/it]INFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:20:15 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████ | 524/1024 [23:57:34<21:24:07, 154.10s/it] + {'loss': 0.0209, 'grad_norm': 0.005491400603204966, 'learning_rate': 1e-05, 'num_tokens': 460944164.0, 'completions/mean_length': 5758.9140625, 'completions/min_length': 1181.0, 'completions/max_length': 15706.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5758.9140625, 'completions/min_terminated_length': 1181.0, 'completions/max_terminated_length': 15706.0, 'rewards/accuracy_reward/mean': 0.5859375, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.5859375, 'reward_std': 0.2330428510904312, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019315458834171295, 'sampling/sampling_logp_difference/max': 5.54492712020874, 'sampling/importance_sampling_ratio/min': 0.003907227888703346, 'sampling/importance_sampling_ratio/mean': 0.9999998807907104, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 
0.8783154934644699, 'clip_ratio/low_mean': 3.145246773783583e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.771700446326577e-06, 'clip_ratio/high_max': 1.9086801785306307e-05, 'clip_ratio/region_mean': 3.622416772941506e-05, 'epoch': 0.48} + + 51%|█████ | 524/1024 [23:57:34<21:24:07, 154.10s/it]INFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:22:34 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████▏ | 525/1024 [24:00:21<21:52:06, 157.77s/it] + {'loss': 0.0103, 'grad_norm': 0.0038622859865427017, 'learning_rate': 1e-05, 'num_tokens': 461931916.0, 'completions/mean_length': 7573.375, 'completions/min_length': 1579.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7504.0, 'completions/min_terminated_length': 1579.0, 'completions/max_terminated_length': 15536.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.14123955368995667, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.02145528793334961, 'sampling/sampling_logp_difference/max': 6.1500749588012695, 'sampling/importance_sampling_ratio/min': 0.002133321948349476, 'sampling/importance_sampling_ratio/mean': 0.9999769926071167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.057753436267376, 'clip_ratio/low_mean': 9.616303373150004e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.888714672939386e-06, 'clip_ratio/high_max': 1.5554858691757545e-05, 'clip_ratio/region_mean': 1.3505018273463065e-05, 'epoch': 0.48} + + 51%|█████▏ | 525/1024 [24:00:21<21:52:06, 157.77s/it]INFO 12-02 13:25:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:25:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 
13:25:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:25:20 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████▏ | 526/1024 [24:02:58<21:47:21, 157.51s/it] + {'loss': 0.0506, 'grad_norm': 0.002902502194046974, 'learning_rate': 1e-05, 'num_tokens': 462894701.0, 'completions/mean_length': 7353.0703125, 'completions/min_length': 907.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7136.328125, 'completions/min_terminated_length': 907.0, 'completions/max_terminated_length': 14553.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.021296534687280655, 'sampling/sampling_logp_difference/max': 5.312461853027344, 'sampling/importance_sampling_ratio/min': 0.00492977537214756, 'sampling/importance_sampling_ratio/mean': 0.9999150037765503, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9386680871248245, 'clip_ratio/low_mean': 4.7102344296945375e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.324094329102081e-06, 'clip_ratio/high_max': 2.2185531634022482e-05, 'clip_ratio/region_mean': 5.342643908079481e-05, 'epoch': 0.48} + + 51%|█████▏ | 526/1024 [24:02:58<21:47:21, 157.51s/it]INFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:27:57 [block_pool.py:292] Successfully reset prefix cache + + 51%|█████▏ | 527/1024 [24:05:41<21:58:00, 159.11s/it] + {'loss': 0.0546, 'grad_norm': 0.002602500608190894, 'learning_rate': 1e-05, 'num_tokens': 463849087.0, 'completions/mean_length': 7280.953125, 'completions/min_length': 1111.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 
'completions/mean_terminated_length': 6987.30615234375, 'completions/min_terminated_length': 1111.0, 'completions/max_terminated_length': 15851.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2290911078453064, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020630592480301857, 'sampling/sampling_logp_difference/max': 10.12484359741211, 'sampling/importance_sampling_ratio/min': 4.007156167062931e-05, 'sampling/importance_sampling_ratio/mean': 0.9999302625656128, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9424067437648773, 'clip_ratio/low_mean': 5.111583186589996e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.666198492486728e-06, 'clip_ratio/high_max': 1.8664793969946913e-05, 'clip_ratio/region_mean': 5.578203035838669e-05, 'epoch': 0.48} + + 51%|█████▏ | 527/1024 [24:05:41<21:58:00, 159.11s/it]INFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:30:40 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 528/1024 [24:07:59<21:03:43, 152.87s/it] + {'loss': 0.1494, 'grad_norm': 0.005743890535086393, 'learning_rate': 1e-05, 'num_tokens': 464704336.0, 'completions/mean_length': 6520.6328125, 'completions/min_length': 1459.0, 'completions/max_length': 14628.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6520.6328125, 'completions/min_terminated_length': 1459.0, 'completions/max_terminated_length': 14628.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3413938879966736, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018370801582932472, 'sampling/sampling_logp_difference/max': 9.74838638305664, 
'sampling/importance_sampling_ratio/min': 5.838880315423012e-05, 'sampling/importance_sampling_ratio/mean': 0.9999988079071045, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8501213267445564, 'clip_ratio/low_mean': 4.5688502041230095e-05, 'clip_ratio/low_min': 5.72383623875794e-06, 'clip_ratio/high_mean': 1.0150766001970624e-05, 'clip_ratio/high_max': 3.77411461158772e-05, 'clip_ratio/region_mean': 5.583926849794807e-05, 'epoch': 0.49} + + 52%|█████▏ | 528/1024 [24:07:59<21:03:43, 152.87s/it]INFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:32:58 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 529/1024 [24:10:23<20:39:19, 150.22s/it] + {'loss': 0.0967, 'grad_norm': 0.004826955031603575, 'learning_rate': 1e-05, 'num_tokens': 465632152.0, 'completions/mean_length': 7111.0, 'completions/min_length': 1288.0, 'completions/max_length': 14675.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7111.0, 'completions/min_terminated_length': 1288.0, 'completions/max_terminated_length': 14675.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2975040376186371, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019976403564214706, 'sampling/sampling_logp_difference/max': 9.061508178710938, 'sampling/importance_sampling_ratio/min': 0.00011604782775975764, 'sampling/importance_sampling_ratio/mean': 0.9999524354934692, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8829544633626938, 'clip_ratio/low_mean': 2.1804387529300584e-05, 'clip_ratio/low_min': 3.918126822100021e-06, 'clip_ratio/high_mean': 2.287563575009699e-06, 'clip_ratio/high_max': 9.150254300038796e-06, 'clip_ratio/region_mean': 2.4091951559057634e-05, 'epoch': 
0.49} + + 52%|█████▏ | 529/1024 [24:10:23<20:39:19, 150.22s/it]INFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:35:23 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 530/1024 [24:13:18<21:38:33, 157.72s/it] + {'loss': 0.0447, 'grad_norm': 0.0028944616205990314, 'learning_rate': 1e-05, 'num_tokens': 466648507.0, 'completions/mean_length': 7797.7109375, 'completions/min_length': 769.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7448.67431640625, 'completions/min_terminated_length': 769.0, 'completions/max_terminated_length': 15132.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.26303553581237793, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020830729976296425, 'sampling/sampling_logp_difference/max': 8.25, 'sampling/importance_sampling_ratio/min': 0.0002612585376482457, 'sampling/importance_sampling_ratio/mean': 0.999991774559021, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9747610911726952, 'clip_ratio/low_mean': 4.392900382299558e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.603994390592561e-06, 'clip_ratio/high_max': 2.3185014015325578e-05, 'clip_ratio/region_mean': 5.153299889570917e-05, 'epoch': 0.49} + + 52%|█████▏ | 530/1024 [24:13:18<21:38:33, 157.72s/it]INFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:38:18 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 531/1024 [24:15:57<21:37:52, 157.96s/it] + {'loss': 0.0573, 'grad_norm': 0.003612271510064602, 
'learning_rate': 1e-05, 'num_tokens': 467487976.0, 'completions/mean_length': 6395.4765625, 'completions/min_length': 227.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6316.82666015625, 'completions/min_terminated_length': 227.0, 'completions/max_terminated_length': 16160.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.2664504945278168, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01959329843521118, 'sampling/sampling_logp_difference/max': 13.624999046325684, 'sampling/importance_sampling_ratio/min': 1.209868287332938e-06, 'sampling/importance_sampling_ratio/mean': 0.9998596906661987, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9015842452645302, 'clip_ratio/low_mean': 4.282657914700394e-05, 'clip_ratio/low_min': 4.545454430626705e-06, 'clip_ratio/high_mean': 3.7368648690971895e-06, 'clip_ratio/high_max': 1.4947459476388758e-05, 'clip_ratio/region_mean': 4.656344435716164e-05, 'epoch': 0.49} + + 52%|█████▏ | 531/1024 [24:15:57<21:37:52, 157.96s/it]INFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:40:56 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 532/1024 [24:18:50<22:14:13, 162.71s/it] + {'loss': 0.0104, 'grad_norm': 0.002104024635627866, 'learning_rate': 1e-05, 'num_tokens': 468445132.0, 'completions/mean_length': 7298.78125, 'completions/min_length': 770.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7154.57177734375, 'completions/min_terminated_length': 770.0, 'completions/max_terminated_length': 15694.0, 'rewards/accuracy_reward/mean': 0.2890625, 'rewards/accuracy_reward/std': 
0.45510825514793396, 'reward': 0.2890625, 'reward_std': 0.2301519513130188, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021517785266041756, 'sampling/sampling_logp_difference/max': 9.872424125671387, 'sampling/importance_sampling_ratio/min': 5.157754640094936e-05, 'sampling/importance_sampling_ratio/mean': 0.9999783039093018, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9978953301906586, 'clip_ratio/low_mean': 1.8946868863167765e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 1.8946868863167765e-05, 'epoch': 0.49} + + 52%|█████▏ | 532/1024 [24:18:50<22:14:13, 162.71s/it]INFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:43:50 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 533/1024 [24:21:45<22:41:23, 166.36s/it] + {'loss': 0.0298, 'grad_norm': 0.0009346248698420823, 'learning_rate': 1e-05, 'num_tokens': 469360760.0, 'completions/mean_length': 7021.53125, 'completions/min_length': 693.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6561.08154296875, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 16003.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.20069600641727448, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020538944751024246, 'sampling/sampling_logp_difference/max': 5.8098626136779785, 'sampling/importance_sampling_ratio/min': 0.0029978419188410044, 'sampling/importance_sampling_ratio/mean': 0.9999547004699707, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9539581760764122, 'clip_ratio/low_mean': 3.0451521752183908e-05, 
'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.441706659643387e-06, 'clip_ratio/high_max': 2.0034196040796814e-05, 'clip_ratio/region_mean': 3.689322829814046e-05, 'epoch': 0.49} + + 52%|█████▏ | 533/1024 [24:21:45<22:41:23, 166.36s/it]INFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:46:45 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 534/1024 [24:24:15<21:58:43, 161.48s/it] + {'loss': 0.0522, 'grad_norm': 0.002331435214728117, 'learning_rate': 1e-05, 'num_tokens': 470274859.0, 'completions/mean_length': 6988.2109375, 'completions/min_length': 1047.0, 'completions/max_length': 15370.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6988.2109375, 'completions/min_terminated_length': 1047.0, 'completions/max_terminated_length': 15370.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.23751860857009888, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02088295854628086, 'sampling/sampling_logp_difference/max': 6.460330963134766, 'sampling/importance_sampling_ratio/min': 0.0015642779180780053, 'sampling/importance_sampling_ratio/mean': 1.000002145767212, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9471191540360451, 'clip_ratio/low_mean': 3.2224923302237585e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.968734807178407e-06, 'clip_ratio/high_max': 7.874939228713629e-06, 'clip_ratio/region_mean': 3.419365827994625e-05, 'epoch': 0.49} + + 52%|█████▏ | 534/1024 [24:24:15<21:58:43, 161.48s/it]INFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-02 13:49:15 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 535/1024 [24:27:10<22:27:59, 165.40s/it] + {'loss': 0.0617, 'grad_norm': 0.004562230780720711, 'learning_rate': 1e-05, 'num_tokens': 471263997.0, 'completions/mean_length': 7557.453125, 'completions/min_length': 1064.0, 'completions/max_length': 16212.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7557.453125, 'completions/min_terminated_length': 1064.0, 'completions/max_terminated_length': 16212.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.2511882185935974, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02160259149968624, 'sampling/sampling_logp_difference/max': 8.748924255371094, 'sampling/importance_sampling_ratio/min': 0.0001586318830959499, 'sampling/importance_sampling_ratio/mean': 1.000011920928955, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9897207245230675, 'clip_ratio/low_mean': 3.8229277151913266e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0911525641386106e-06, 'clip_ratio/high_max': 1.2364610256554442e-05, 'clip_ratio/region_mean': 4.132042954552162e-05, 'epoch': 0.49} + + 52%|█████▏ | 535/1024 [24:27:10<22:27:59, 165.40s/it]INFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:52:10 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 536/1024 [24:29:46<22:01:27, 162.48s/it] + {'loss': 0.0219, 'grad_norm': 0.004525062162429094, 'learning_rate': 1e-05, 'num_tokens': 472120622.0, 'completions/mean_length': 6532.1953125, 'completions/min_length': 737.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6295.75244140625, 
'completions/min_terminated_length': 737.0, 'completions/max_terminated_length': 15603.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3487703502178192, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019527796655893326, 'sampling/sampling_logp_difference/max': 11.124346733093262, 'sampling/importance_sampling_ratio/min': 1.474883083574241e-05, 'sampling/importance_sampling_ratio/mean': 0.9999650120735168, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9109068289399147, 'clip_ratio/low_mean': 5.8747830053107464e-05, 'clip_ratio/low_min': 1.3906133062846493e-05, 'clip_ratio/high_mean': 7.420082738462952e-06, 'clip_ratio/high_max': 2.6050724500237266e-05, 'clip_ratio/region_mean': 6.616791324631777e-05, 'epoch': 0.49} + + 52%|█████▏ | 536/1024 [24:29:46<22:01:27, 162.48s/it]INFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:54:45 [block_pool.py:292] Successfully reset prefix cache + + 52%|█████▏ | 537/1024 [24:32:16<21:29:54, 158.92s/it] + {'loss': 0.0165, 'grad_norm': 0.005058468785136938, 'learning_rate': 1e-05, 'num_tokens': 472906346.0, 'completions/mean_length': 5994.40625, 'completions/min_length': 531.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5912.5986328125, 'completions/min_terminated_length': 531.0, 'completions/max_terminated_length': 15011.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.19044627249240875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.020568232983350754, 'sampling/sampling_logp_difference/max': 7.562398910522461, 'sampling/importance_sampling_ratio/min': 
0.0005196271813474596, 'sampling/importance_sampling_ratio/mean': 0.9999456405639648, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9276224821805954, 'clip_ratio/low_mean': 3.90738064766083e-05, 'clip_ratio/low_min': 1.0626089533616323e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.90738064766083e-05, 'epoch': 0.49} + + 52%|█████▏ | 537/1024 [24:32:16<21:29:54, 158.92s/it]INFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:57:16 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 538/1024 [24:34:52<21:20:22, 158.07s/it] + {'loss': 0.1282, 'grad_norm': 0.007286665495485067, 'learning_rate': 1e-05, 'num_tokens': 473756256.0, 'completions/mean_length': 6469.046875, 'completions/min_length': 891.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6311.6669921875, 'completions/min_terminated_length': 891.0, 'completions/max_terminated_length': 15992.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.35772189497947693, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.019624462351202965, 'sampling/sampling_logp_difference/max': 9.681252479553223, 'sampling/importance_sampling_ratio/min': 6.244324322324246e-05, 'sampling/importance_sampling_ratio/mean': 1.0000038146972656, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9536962807178497, 'clip_ratio/low_mean': 5.992188062009518e-05, 'clip_ratio/low_min': 1.2131874427723233e-05, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.992188062009518e-05, 'epoch': 0.49} + + 53%|█████▎ | 538/1024 [24:34:52<21:20:22, 158.07s/it]INFO 12-02 13:59:52 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-02 13:59:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:59:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 13:59:52 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 539/1024 [24:37:03<20:10:27, 149.75s/it] + {'loss': -0.0091, 'grad_norm': 0.0031439310405403376, 'learning_rate': 1e-05, 'num_tokens': 474515194.0, 'completions/mean_length': 5778.703125, 'completions/min_length': 903.0, 'completions/max_length': 15383.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5778.703125, 'completions/min_terminated_length': 903.0, 'completions/max_terminated_length': 15383.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019796252250671387, 'sampling/sampling_logp_difference/max': 7.374977111816406, 'sampling/importance_sampling_ratio/min': 0.0006267410353757441, 'sampling/importance_sampling_ratio/mean': 1.0000576972961426, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9274095296859741, 'clip_ratio/low_mean': 3.329443018174061e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.504626536392607e-06, 'clip_ratio/high_max': 1.0018506145570427e-05, 'clip_ratio/region_mean': 3.57990563770727e-05, 'epoch': 0.5} + + 53%|█████▎ | 539/1024 [24:37:03<20:10:27, 149.75s/it]INFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:02:02 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 540/1024 [24:39:40<20:27:01, 152.11s/it] + {'loss': 0.0938, 'grad_norm': 0.0039032045751810074, 'learning_rate': 1e-05, 'num_tokens': 475355186.0, 'completions/mean_length': 6400.75, 
'completions/min_length': 1015.0, 'completions/max_length': 16146.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6400.75, 'completions/min_terminated_length': 1015.0, 'completions/max_terminated_length': 16146.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.3135277032852173, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019878748804330826, 'sampling/sampling_logp_difference/max': 12.3806791305542, 'sampling/importance_sampling_ratio/min': 4.19893694925122e-06, 'sampling/importance_sampling_ratio/mean': 0.9999880194664001, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8927748426795006, 'clip_ratio/low_mean': 4.140612338687788e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.23904565297562e-06, 'clip_ratio/high_max': 3.1761268928676145e-05, 'clip_ratio/region_mean': 5.064516949460085e-05, 'epoch': 0.5} + + 53%|█████▎ | 540/1024 [24:39:40<20:27:01, 152.11s/it]INFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:04:40 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 541/1024 [24:42:30<21:08:14, 157.55s/it] + {'loss': 0.0642, 'grad_norm': 0.004979084711521864, 'learning_rate': 1e-05, 'num_tokens': 476289752.0, 'completions/mean_length': 7150.234375, 'completions/min_length': 1548.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6928.62451171875, 'completions/min_terminated_length': 1548.0, 'completions/max_terminated_length': 14347.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.3369181156158447, 'frac_reward_zero_std': 0.25, 
'sampling/sampling_logp_difference/mean': 0.019658904522657394, 'sampling/sampling_logp_difference/max': 7.75062894821167, 'sampling/importance_sampling_ratio/min': 0.0004304716712795198, 'sampling/importance_sampling_ratio/mean': 0.9999991059303284, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8632503524422646, 'clip_ratio/low_mean': 5.609390495919797e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.227385253827379e-06, 'clip_ratio/high_max': 2.524126966818585e-05, 'clip_ratio/region_mean': 6.332129100883321e-05, 'epoch': 0.5} + + 53%|█████▎ | 541/1024 [24:42:30<21:08:14, 157.55s/it]INFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:07:30 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 542/1024 [24:45:29<21:56:38, 163.90s/it] + {'loss': 0.0332, 'grad_norm': 0.003560611279681325, 'learning_rate': 1e-05, 'num_tokens': 477186885.0, 'completions/mean_length': 6855.6640625, 'completions/min_length': 771.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6704.4208984375, 'completions/min_terminated_length': 771.0, 'completions/max_terminated_length': 16353.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2743411958217621, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01880962960422039, 'sampling/sampling_logp_difference/max': 8.466726303100586, 'sampling/importance_sampling_ratio/min': 0.00021035241661593318, 'sampling/importance_sampling_ratio/mean': 0.9998643398284912, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8328540697693825, 'clip_ratio/low_mean': 3.922748987861269e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.324626497189456e-06, 
'clip_ratio/high_max': 2.5298505988757825e-05, 'clip_ratio/region_mean': 4.555211648948898e-05, 'epoch': 0.5} + + 53%|█████▎ | 542/1024 [24:45:29<21:56:38, 163.90s/it]INFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:10:29 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 543/1024 [24:48:32<22:38:57, 169.52s/it] + {'loss': 0.0773, 'grad_norm': 0.0037869063671678305, 'learning_rate': 1e-05, 'num_tokens': 478121506.0, 'completions/mean_length': 7117.1015625, 'completions/min_length': 1067.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6818.1689453125, 'completions/min_terminated_length': 1067.0, 'completions/max_terminated_length': 15880.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2919674217700958, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.0203043594956398, 'sampling/sampling_logp_difference/max': 14.937435150146484, 'sampling/importance_sampling_ratio/min': 3.256524507833092e-07, 'sampling/importance_sampling_ratio/mean': 0.9999738931655884, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9280833601951599, 'clip_ratio/low_mean': 5.487640487444878e-05, 'clip_ratio/low_min': 6.345177553157555e-06, 'clip_ratio/high_mean': 2.226903745849995e-06, 'clip_ratio/high_max': 8.90761498339998e-06, 'clip_ratio/region_mean': 5.7103308108708006e-05, 'epoch': 0.5} + + 53%|█████▎ | 543/1024 [24:48:32<22:38:57, 169.52s/it]INFO 12-02 14:13:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:13:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:13:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:13:31 [block_pool.py:292] Successfully 
reset prefix cache + + 53%|█████▎ | 544/1024 [24:51:03<21:50:55, 163.87s/it] + {'loss': 0.0847, 'grad_norm': 0.002787451259791851, 'learning_rate': 1e-05, 'num_tokens': 479021365.0, 'completions/mean_length': 6885.7109375, 'completions/min_length': 1184.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6734.94482421875, 'completions/min_terminated_length': 1184.0, 'completions/max_terminated_length': 16046.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02060278132557869, 'sampling/sampling_logp_difference/max': 6.589450836181641, 'sampling/importance_sampling_ratio/min': 0.0013747947523370385, 'sampling/importance_sampling_ratio/mean': 1.0000042915344238, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9137701392173767, 'clip_ratio/low_mean': 3.976425330165512e-05, 'clip_ratio/low_min': 4.979286131856497e-06, 'clip_ratio/high_mean': 3.370686670223222e-06, 'clip_ratio/high_max': 1.3482746680892888e-05, 'clip_ratio/region_mean': 4.313493991503492e-05, 'epoch': 0.5} + + 53%|█████▎ | 544/1024 [24:51:03<21:50:55, 163.87s/it]INFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:16:02 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 545/1024 [24:53:55<22:07:57, 166.34s/it] + {'loss': 0.0225, 'grad_norm': 0.005555091425776482, 'learning_rate': 1e-05, 'num_tokens': 479951778.0, 'completions/mean_length': 7055.7265625, 'completions/min_length': 601.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6982.275390625, 'completions/min_terminated_length': 601.0, 
'completions/max_terminated_length': 15047.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02176634594798088, 'sampling/sampling_logp_difference/max': 15.100777626037598, 'sampling/importance_sampling_ratio/min': 2.7657671353154e-07, 'sampling/importance_sampling_ratio/mean': 0.9999507665634155, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1009352952241898, 'clip_ratio/low_mean': 4.93504342102824e-05, 'clip_ratio/low_min': 5.1258921303087845e-06, 'clip_ratio/high_mean': 8.077826691987866e-06, 'clip_ratio/high_max': 2.918380459959735e-05, 'clip_ratio/region_mean': 5.742826124333078e-05, 'epoch': 0.5} + + 53%|█████▎ | 545/1024 [24:53:55<22:07:57, 166.34s/it]INFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:18:54 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 546/1024 [24:56:21<21:17:28, 160.35s/it] + {'loss': 0.1423, 'grad_norm': 0.00568060576915741, 'learning_rate': 1e-05, 'num_tokens': 480749677.0, 'completions/mean_length': 6088.2109375, 'completions/min_length': 528.0, 'completions/max_length': 16100.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6088.2109375, 'completions/min_terminated_length': 528.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.6484375, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.6484375, 'reward_std': 0.3729842007160187, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.017093103379011154, 'sampling/sampling_logp_difference/max': 8.437424659729004, 'sampling/importance_sampling_ratio/min': 0.0002166072663385421, 
'sampling/importance_sampling_ratio/mean': 0.9999527931213379, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7534168809652328, 'clip_ratio/low_mean': 3.58120408918694e-05, 'clip_ratio/low_min': 5.571651399804978e-06, 'clip_ratio/high_mean': 2.43807289734832e-06, 'clip_ratio/high_max': 9.75229158939328e-06, 'clip_ratio/region_mean': 3.825011424396507e-05, 'epoch': 0.5} + + 53%|█████▎ | 546/1024 [24:56:21<21:17:28, 160.35s/it]INFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:21:21 [block_pool.py:292] Successfully reset prefix cache + + 53%|█████▎ | 547/1024 [24:58:50<20:47:46, 156.95s/it] + {'loss': 0.1025, 'grad_norm': 0.0019015485886484385, 'learning_rate': 1e-05, 'num_tokens': 481489954.0, 'completions/mean_length': 5638.8515625, 'completions/min_length': 1352.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5380.96826171875, 'completions/min_terminated_length': 1352.0, 'completions/max_terminated_length': 16029.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019102448597550392, 'sampling/sampling_logp_difference/max': 8.62470817565918, 'sampling/importance_sampling_ratio/min': 0.0001796126161934808, 'sampling/importance_sampling_ratio/mean': 0.999911904335022, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8868100792169571, 'clip_ratio/low_mean': 2.870424191314669e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.5532760850619525e-06, 'clip_ratio/high_max': 1.821310434024781e-05, 'clip_ratio/region_mean': 3.325751754346129e-05, 'epoch': 0.5} + + 53%|█████▎ | 547/1024 [24:58:50<20:47:46, 156.95s/it]INFO 12-02 
14:23:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:23:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:23:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:23:50 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▎ | 548/1024 [25:01:28<20:47:06, 157.20s/it] + {'loss': 0.0642, 'grad_norm': 0.004203350283205509, 'learning_rate': 1e-05, 'num_tokens': 482375358.0, 'completions/mean_length': 6776.59375, 'completions/min_length': 588.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6624.095703125, 'completions/min_terminated_length': 588.0, 'completions/max_terminated_length': 15258.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.22567126154899597, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019327163696289062, 'sampling/sampling_logp_difference/max': 5.6320695877075195, 'sampling/importance_sampling_ratio/min': 0.0036098493728786707, 'sampling/importance_sampling_ratio/mean': 0.9999104738235474, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9075161814689636, 'clip_ratio/low_mean': 3.169551814607985e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.0229532411758555e-06, 'clip_ratio/high_max': 2.3414544557454064e-05, 'clip_ratio/region_mean': 3.8718471842003055e-05, 'epoch': 0.5} + + 54%|█████▎ | 548/1024 [25:01:28<20:47:06, 157.20s/it]INFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:26:27 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▎ | 549/1024 [25:04:22<21:25:56, 162.44s/it] + {'loss': 0.0499, 'grad_norm': 0.004891456104815006, 'learning_rate': 1e-05, 'num_tokens': 483357450.0, 
'completions/mean_length': 7507.59375, 'completions/min_length': 774.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7071.048828125, 'completions/min_terminated_length': 774.0, 'completions/max_terminated_length': 15684.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2772369980812073, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019086822867393494, 'sampling/sampling_logp_difference/max': 5.721317291259766, 'sampling/importance_sampling_ratio/min': 0.0032753932755440474, 'sampling/importance_sampling_ratio/mean': 0.9999200701713562, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8015655726194382, 'clip_ratio/low_mean': 3.6077018648938974e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.84939061809564e-06, 'clip_ratio/high_max': 1.8746226487564854e-05, 'clip_ratio/region_mean': 4.192640903966094e-05, 'epoch': 0.51} + + 54%|█████▎ | 549/1024 [25:04:22<21:25:56, 162.44s/it]INFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:29:22 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▎ | 550/1024 [25:07:10<21:34:52, 163.91s/it] + {'loss': 0.028, 'grad_norm': 0.003564947983250022, 'learning_rate': 1e-05, 'num_tokens': 484153554.0, 'completions/mean_length': 6061.3125, 'completions/min_length': 627.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5813.568359375, 'completions/min_terminated_length': 627.0, 'completions/max_terminated_length': 16107.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.26143792271614075, 
'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018360145390033722, 'sampling/sampling_logp_difference/max': 3.908921003341675, 'sampling/importance_sampling_ratio/min': 0.02006213553249836, 'sampling/importance_sampling_ratio/mean': 0.9999876022338867, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8335569724440575, 'clip_ratio/low_mean': 3.096040018135682e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.011492757806991e-06, 'clip_ratio/high_max': 2.4045971031227964e-05, 'clip_ratio/region_mean': 3.697189299600723e-05, 'epoch': 0.51} + + 54%|█████▎ | 550/1024 [25:07:10<21:34:52, 163.91s/it]INFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:32:09 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 551/1024 [25:10:11<22:11:51, 168.95s/it] + {'loss': 0.0976, 'grad_norm': 0.0032013265881687403, 'learning_rate': 1e-05, 'num_tokens': 485111601.0, 'completions/mean_length': 7312.4921875, 'completions/min_length': 588.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7241.06298828125, 'completions/min_terminated_length': 588.0, 'completions/max_terminated_length': 15957.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.21040895581245422, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020712960511446, 'sampling/sampling_logp_difference/max': 5.0278730392456055, 'sampling/importance_sampling_ratio/min': 0.006552733480930328, 'sampling/importance_sampling_ratio/mean': 0.9999306201934814, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9900097697973251, 'clip_ratio/low_mean': 4.612986276697484e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 
2.2738347524864366e-06, 'clip_ratio/high_max': 9.095339009945747e-06, 'clip_ratio/region_mean': 4.840369865632965e-05, 'epoch': 0.51} + + 54%|█████▍ | 551/1024 [25:10:11<22:11:51, 168.95s/it]INFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:35:10 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 552/1024 [25:12:33<21:06:55, 161.05s/it] + {'loss': 0.0888, 'grad_norm': 0.002972986316308379, 'learning_rate': 1e-05, 'num_tokens': 485971554.0, 'completions/mean_length': 6571.4453125, 'completions/min_length': 951.0, 'completions/max_length': 14797.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6571.4453125, 'completions/min_terminated_length': 951.0, 'completions/max_terminated_length': 14797.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020055105909705162, 'sampling/sampling_logp_difference/max': 10.613155364990234, 'sampling/importance_sampling_ratio/min': 2.4590379325672984e-05, 'sampling/importance_sampling_ratio/mean': 0.9998995065689087, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8801060244441032, 'clip_ratio/low_mean': 4.3424448904261226e-05, 'clip_ratio/low_min': 4.718405762105249e-06, 'clip_ratio/high_mean': 4.2937051603075815e-06, 'clip_ratio/high_max': 1.360053283860907e-05, 'clip_ratio/region_mean': 4.771815429194248e-05, 'epoch': 0.51} + + 54%|█████▍ | 552/1024 [25:12:33<21:06:55, 161.05s/it]INFO 12-02 14:37:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:37:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:37:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:37:33 
[block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 553/1024 [25:15:12<20:59:42, 160.47s/it] + {'loss': 0.0278, 'grad_norm': 0.00798189826309681, 'learning_rate': 1e-05, 'num_tokens': 486873791.0, 'completions/mean_length': 6879.2890625, 'completions/min_length': 430.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6728.4208984375, 'completions/min_terminated_length': 430.0, 'completions/max_terminated_length': 16243.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.22673210501670837, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02010834403336048, 'sampling/sampling_logp_difference/max': 5.25710916519165, 'sampling/importance_sampling_ratio/min': 0.005210345610976219, 'sampling/importance_sampling_ratio/mean': 0.9999493956565857, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8452998399734497, 'clip_ratio/low_mean': 3.511405452627514e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.057813901501504e-06, 'clip_ratio/high_max': 8.231255606006016e-06, 'clip_ratio/region_mean': 3.71718685983069e-05, 'epoch': 0.51} + + 54%|█████▍ | 553/1024 [25:15:12<20:59:42, 160.47s/it]INFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:40:12 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 554/1024 [25:17:52<20:54:13, 160.11s/it] + {'loss': 0.0389, 'grad_norm': 0.0038661460857838392, 'learning_rate': 1e-05, 'num_tokens': 487814936.0, 'completions/mean_length': 7169.8828125, 'completions/min_length': 694.0, 'completions/max_length': 16237.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7169.8828125, 'completions/min_terminated_length': 694.0, 
'completions/max_terminated_length': 16237.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.23751862347126007, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02097059041261673, 'sampling/sampling_logp_difference/max': 9.96898078918457, 'sampling/importance_sampling_ratio/min': 4.6830271458020434e-05, 'sampling/importance_sampling_ratio/mean': 0.9999849796295166, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9671438857913017, 'clip_ratio/low_mean': 6.0756912262149854e-05, 'clip_ratio/low_min': 1.0878021839744179e-05, 'clip_ratio/high_mean': 4.394269467411505e-06, 'clip_ratio/high_max': 1.757707786964602e-05, 'clip_ratio/region_mean': 6.51511809337535e-05, 'epoch': 0.51} + + 54%|█████▍ | 554/1024 [25:17:52<20:54:13, 160.11s/it]INFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:42:51 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 555/1024 [25:20:26<20:38:25, 158.43s/it] + {'loss': 0.0252, 'grad_norm': 0.002214127918705344, 'learning_rate': 1e-05, 'num_tokens': 488720293.0, 'completions/mean_length': 6945.0390625, 'completions/min_length': 940.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6870.71630859375, 'completions/min_terminated_length': 940.0, 'completions/max_terminated_length': 15458.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01968962326645851, 'sampling/sampling_logp_difference/max': 8.04468059539795, 'sampling/importance_sampling_ratio/min': 0.00032080389792099595, 
'sampling/importance_sampling_ratio/mean': 0.9999914169311523, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9309702143073082, 'clip_ratio/low_mean': 3.180719090778439e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1623150157902273e-06, 'clip_ratio/high_max': 4.649260063160909e-06, 'clip_ratio/region_mean': 3.2969506037261453e-05, 'epoch': 0.51} + + 54%|█████▍ | 555/1024 [25:20:26<20:38:25, 158.43s/it]INFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:45:26 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 556/1024 [25:22:57<20:18:40, 156.24s/it] + {'loss': 0.0919, 'grad_norm': 0.0026088031008839607, 'learning_rate': 1e-05, 'num_tokens': 489504626.0, 'completions/mean_length': 5970.1015625, 'completions/min_length': 610.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5804.8017578125, 'completions/min_terminated_length': 610.0, 'completions/max_terminated_length': 16332.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3237725496292114, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018132124096155167, 'sampling/sampling_logp_difference/max': 7.999942779541016, 'sampling/importance_sampling_ratio/min': 0.00033548183273524046, 'sampling/importance_sampling_ratio/mean': 0.9999892711639404, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8274230882525444, 'clip_ratio/low_mean': 5.9988536690980254e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.857000706375402e-06, 'clip_ratio/high_max': 1.5428002825501608e-05, 'clip_ratio/region_mean': 6.384553716998198e-05, 'epoch': 0.51} + + 54%|█████▍ | 556/1024 [25:22:57<20:18:40, 156.24s/it]INFO 12-02 14:47:57 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:47:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:47:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:47:57 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 557/1024 [25:25:45<20:44:11, 159.85s/it] + {'loss': 0.0021, 'grad_norm': 0.0040014018304646015, 'learning_rate': 1e-05, 'num_tokens': 490431156.0, 'completions/mean_length': 7099.578125, 'completions/min_length': 567.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6952.20654296875, 'completions/min_terminated_length': 567.0, 'completions/max_terminated_length': 15636.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.25460803508758545, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02036934345960617, 'sampling/sampling_logp_difference/max': 7.249959468841553, 'sampling/importance_sampling_ratio/min': 0.0007102031959220767, 'sampling/importance_sampling_ratio/mean': 0.9999368786811829, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8690815567970276, 'clip_ratio/low_mean': 3.257978141846252e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.032566036788921e-06, 'clip_ratio/high_max': 1.628765676287003e-05, 'clip_ratio/region_mean': 3.761234722787776e-05, 'epoch': 0.51} + + 54%|█████▍ | 557/1024 [25:25:45<20:44:11, 159.85s/it]INFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:50:45 [block_pool.py:292] Successfully reset prefix cache + + 54%|█████▍ | 558/1024 [25:28:42<21:19:19, 164.72s/it] + {'loss': 0.0711, 'grad_norm': 0.002252641599625349, 'learning_rate': 1e-05, 'num_tokens': 491378450.0, 
'completions/mean_length': 7253.296875, 'completions/min_length': 727.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6725.07421875, 'completions/min_terminated_length': 727.0, 'completions/max_terminated_length': 16301.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2488291710615158, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01926814392209053, 'sampling/sampling_logp_difference/max': 10.87448501586914, 'sampling/importance_sampling_ratio/min': 1.893525586638134e-05, 'sampling/importance_sampling_ratio/mean': 0.9999855756759644, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8692722395062447, 'clip_ratio/low_mean': 3.747020150512981e-05, 'clip_ratio/low_min': 3.852436293527717e-06, 'clip_ratio/high_mean': 3.3287286100858182e-06, 'clip_ratio/high_max': 1.3314914440343273e-05, 'clip_ratio/region_mean': 4.079892983099853e-05, 'epoch': 0.51} + + 54%|█████▍ | 558/1024 [25:28:42<21:19:19, 164.72s/it]INFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:53:41 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▍ | 559/1024 [25:31:40<21:49:03, 168.91s/it] + {'loss': 0.0684, 'grad_norm': 0.0023995323572307825, 'learning_rate': 1e-05, 'num_tokens': 492398757.0, 'completions/mean_length': 7827.0234375, 'completions/min_length': 808.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7406.18798828125, 'completions/min_terminated_length': 808.0, 'completions/max_terminated_length': 15865.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 
0.26826781034469604, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020725054666399956, 'sampling/sampling_logp_difference/max': 7.951230525970459, 'sampling/importance_sampling_ratio/min': 0.0003522284678183496, 'sampling/importance_sampling_ratio/mean': 0.9999961256980896, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9718392416834831, 'clip_ratio/low_mean': 3.905345306520758e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0567253070803417e-05, 'clip_ratio/high_max': 3.51339258486405e-05, 'clip_ratio/region_mean': 4.962070602232416e-05, 'epoch': 0.51} + + 55%|█████▍ | 559/1024 [25:31:40<21:49:03, 168.91s/it]INFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:56:40 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▍ | 560/1024 [25:34:12<21:05:35, 163.65s/it] + {'loss': 0.0298, 'grad_norm': 0.0053934333845973015, 'learning_rate': 1e-05, 'num_tokens': 493259049.0, 'completions/mean_length': 6578.53125, 'completions/min_length': 80.0, 'completions/max_length': 14833.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6578.53125, 'completions/min_terminated_length': 80.0, 'completions/max_terminated_length': 14833.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.29196250438690186, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019497254863381386, 'sampling/sampling_logp_difference/max': 13.345943450927734, 'sampling/importance_sampling_ratio/min': 1.5993017541404697e-06, 'sampling/importance_sampling_ratio/mean': 0.999976396560669, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9265799149870872, 'clip_ratio/low_mean': 4.477454979223694e-05, 'clip_ratio/low_min': 
3.5987793580716243e-06, 'clip_ratio/high_mean': 2.3092504193300556e-06, 'clip_ratio/high_max': 9.237001677320222e-06, 'clip_ratio/region_mean': 4.708380049578409e-05, 'epoch': 0.52} + + 55%|█████▍ | 560/1024 [25:34:12<21:05:35, 163.65s/it]INFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 14:59:11 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▍ | 561/1024 [25:37:03<21:20:12, 165.90s/it] + {'loss': 0.061, 'grad_norm': 0.003773769596591592, 'learning_rate': 1e-05, 'num_tokens': 494288028.0, 'completions/mean_length': 7893.7734375, 'completions/min_length': 763.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7826.92138671875, 'completions/min_terminated_length': 763.0, 'completions/max_terminated_length': 15783.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.29272884130477905, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020743828266859055, 'sampling/sampling_logp_difference/max': 9.982173919677734, 'sampling/importance_sampling_ratio/min': 4.6216489863581955e-05, 'sampling/importance_sampling_ratio/mean': 1.0000444650650024, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9697273746132851, 'clip_ratio/low_mean': 4.2538599473118666e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.580789669082151e-06, 'clip_ratio/high_max': 6.991247119003674e-06, 'clip_ratio/region_mean': 4.511938891482714e-05, 'epoch': 0.52} + + 55%|█████▍ | 561/1024 [25:37:03<21:20:12, 165.90s/it]INFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-02 15:02:02 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▍ | 562/1024 [25:39:55<21:32:50, 167.90s/it] + {'loss': 0.0217, 'grad_norm': 0.006334445904940367, 'learning_rate': 1e-05, 'num_tokens': 495135903.0, 'completions/mean_length': 6483.7734375, 'completions/min_length': 1030.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6405.81884765625, 'completions/min_terminated_length': 1030.0, 'completions/max_terminated_length': 15024.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.20251333713531494, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018669776618480682, 'sampling/sampling_logp_difference/max': 8.99797248840332, 'sampling/importance_sampling_ratio/min': 0.0001236602693097666, 'sampling/importance_sampling_ratio/mean': 0.9999064207077026, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8293593674898148, 'clip_ratio/low_mean': 3.2997783137034276e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.29665919909894e-06, 'clip_ratio/high_max': 1.060595786839258e-05, 'clip_ratio/region_mean': 3.729444244982005e-05, 'epoch': 0.52} + + 55%|█████▍ | 562/1024 [25:39:55<21:32:50, 167.90s/it]INFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:04:55 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▍ | 563/1024 [25:42:43<21:29:09, 167.79s/it] + {'loss': 0.0865, 'grad_norm': 0.003286323742941022, 'learning_rate': 1e-05, 'num_tokens': 495986277.0, 'completions/mean_length': 6484.546875, 'completions/min_length': 630.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6246.96044921875, 
'completions/min_terminated_length': 630.0, 'completions/max_terminated_length': 16230.0, 'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.3763991594314575, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018656805157661438, 'sampling/sampling_logp_difference/max': 10.809014320373535, 'sampling/importance_sampling_ratio/min': 2.0216441043885425e-05, 'sampling/importance_sampling_ratio/mean': 0.999945342540741, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7686850279569626, 'clip_ratio/low_mean': 4.667806888392079e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.3393192236653704e-06, 'clip_ratio/high_max': 9.357276894661481e-06, 'clip_ratio/region_mean': 4.901738748230855e-05, 'epoch': 0.52} + + 55%|█████▍ | 563/1024 [25:42:43<21:29:09, 167.79s/it]INFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:07:42 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▌ | 564/1024 [25:45:27<21:18:53, 166.81s/it] + {'loss': -0.0049, 'grad_norm': 0.005072349216789007, 'learning_rate': 1e-05, 'num_tokens': 496826094.0, 'completions/mean_length': 6411.3203125, 'completions/min_length': 952.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 5746.47509765625, 'completions/min_terminated_length': 952.0, 'completions/max_terminated_length': 15720.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019648944959044456, 'sampling/sampling_logp_difference/max': 5.5721211433410645, 'sampling/importance_sampling_ratio/min': 0.0038024066016077995, 
'sampling/importance_sampling_ratio/mean': 0.9999135732650757, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.899998240172863, 'clip_ratio/low_mean': 8.26880966542376e-06, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.76577109668142e-06, 'clip_ratio/high_max': 3.368905208844808e-05, 'clip_ratio/region_mean': 1.8034580989478854e-05, 'epoch': 0.52} + + 55%|█████▌ | 564/1024 [25:45:27<21:18:53, 166.81s/it]INFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:10:27 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▌ | 565/1024 [25:48:34<22:02:38, 172.89s/it] + {'loss': 0.0871, 'grad_norm': 0.005030680447816849, 'learning_rate': 1e-05, 'num_tokens': 497756469.0, 'completions/mean_length': 7110.0546875, 'completions/min_length': 686.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6810.89501953125, 'completions/min_terminated_length': 686.0, 'completions/max_terminated_length': 16300.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3253750801086426, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.02187274768948555, 'sampling/sampling_logp_difference/max': 7.749985218048096, 'sampling/importance_sampling_ratio/min': 0.0004307488852646202, 'sampling/importance_sampling_ratio/mean': 0.999985933303833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0061073675751686, 'clip_ratio/low_mean': 4.834715275592316e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.2551004020861e-06, 'clip_ratio/high_max': 1.726673963275971e-05, 'clip_ratio/region_mean': 5.4602252930635586e-05, 'epoch': 0.52} + + 55%|█████▌ | 565/1024 [25:48:34<22:02:38, 172.89s/it]INFO 12-02 15:13:34 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-02 15:13:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:13:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:13:34 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▌ | 566/1024 [25:51:21<21:44:25, 170.89s/it] + {'loss': -0.0016, 'grad_norm': 0.002894402015954256, 'learning_rate': 1e-05, 'num_tokens': 498743411.0, 'completions/mean_length': 7546.484375, 'completions/min_length': 405.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7261.40283203125, 'completions/min_terminated_length': 405.0, 'completions/max_terminated_length': 16100.0, 'rewards/accuracy_reward/mean': 0.25, 'rewards/accuracy_reward/std': 0.434714138507843, 'reward': 0.25, 'reward_std': 0.2380426526069641, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019597206264734268, 'sampling/sampling_logp_difference/max': 10.306904792785645, 'sampling/importance_sampling_ratio/min': 3.340166585985571e-05, 'sampling/importance_sampling_ratio/mean': 0.9998988509178162, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.898541085422039, 'clip_ratio/low_mean': 2.627351494766117e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.480095630147844e-07, 'clip_ratio/high_max': 3.3920382520591374e-06, 'clip_ratio/region_mean': 2.712152416961544e-05, 'epoch': 0.52} + + 55%|█████▌ | 566/1024 [25:51:21<21:44:25, 170.89s/it]INFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:16:20 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▌ | 567/1024 [25:54:09<21:35:22, 170.07s/it] + {'loss': 0.0352, 'grad_norm': 0.0033100086729973555, 'learning_rate': 1e-05, 'num_tokens': 499612490.0, 'completions/mean_length': 6637.9296875, 
'completions/min_length': 340.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6241.74755859375, 'completions/min_terminated_length': 340.0, 'completions/max_terminated_length': 15426.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.2782978415489197, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019627269357442856, 'sampling/sampling_logp_difference/max': 8.448633193969727, 'sampling/importance_sampling_ratio/min': 0.000214192972634919, 'sampling/importance_sampling_ratio/mean': 0.9999792575836182, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9469815120100975, 'clip_ratio/low_mean': 1.9815101950371172e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.518700269632973e-07, 'clip_ratio/high_max': 3.407480107853189e-06, 'clip_ratio/region_mean': 2.066697197733447e-05, 'epoch': 0.52} + + 55%|█████▌ | 567/1024 [25:54:09<21:35:22, 170.07s/it]INFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:19:08 [block_pool.py:292] Successfully reset prefix cache + + 55%|█████▌ | 568/1024 [25:56:55<21:23:47, 168.92s/it] + {'loss': 0.0543, 'grad_norm': 0.006571728736162186, 'learning_rate': 1e-05, 'num_tokens': 500515117.0, 'completions/mean_length': 6903.0859375, 'completions/min_length': 602.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6752.595703125, 'completions/min_terminated_length': 602.0, 'completions/max_terminated_length': 15136.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3335031569004059, 'frac_reward_zero_std': 0.25, 
'sampling/sampling_logp_difference/mean': 0.020653847604990005, 'sampling/sampling_logp_difference/max': 4.107652187347412, 'sampling/importance_sampling_ratio/min': 0.016446342691779137, 'sampling/importance_sampling_ratio/mean': 0.9999945163726807, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.976447619497776, 'clip_ratio/low_mean': 6.551078422489809e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.2405809419251455e-06, 'clip_ratio/high_max': 2.8962323767700582e-05, 'clip_ratio/region_mean': 7.275136522366665e-05, 'epoch': 0.52} + + 55%|█████▌ | 568/1024 [25:56:55<21:23:47, 168.92s/it]INFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:21:55 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▌ | 569/1024 [25:59:41<21:14:02, 168.01s/it] + {'loss': 0.0618, 'grad_norm': 0.007468517404049635, 'learning_rate': 1e-05, 'num_tokens': 501427056.0, 'completions/mean_length': 6953.8359375, 'completions/min_length': 88.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6570.49560546875, 'completions/min_terminated_length': 88.0, 'completions/max_terminated_length': 15556.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3571978807449341, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01937997341156006, 'sampling/sampling_logp_difference/max': 8.562470436096191, 'sampling/importance_sampling_ratio/min': 0.0001911464933073148, 'sampling/importance_sampling_ratio/mean': 1.0000053644180298, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8397975340485573, 'clip_ratio/low_mean': 7.513643731726916e-05, 'clip_ratio/low_min': 2.2551557776750997e-05, 'clip_ratio/high_mean': 
3.6441037991608027e-06, 'clip_ratio/high_max': 1.4576415196643211e-05, 'clip_ratio/region_mean': 7.878054020693526e-05, 'epoch': 0.52} + + 56%|█████▌ | 569/1024 [25:59:41<21:14:02, 168.01s/it]INFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:24:41 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▌ | 570/1024 [26:02:41<21:37:37, 171.49s/it] + {'loss': 0.0431, 'grad_norm': 0.004324767272919416, 'learning_rate': 1e-05, 'num_tokens': 502445156.0, 'completions/mean_length': 7807.09375, 'completions/min_length': 562.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7458.43896484375, 'completions/min_terminated_length': 562.0, 'completions/max_terminated_length': 15961.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.3329663574695587, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018592730164527893, 'sampling/sampling_logp_difference/max': 10.418506622314453, 'sampling/importance_sampling_ratio/min': 2.9874459869461134e-05, 'sampling/importance_sampling_ratio/mean': 0.9999243021011353, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7974586114287376, 'clip_ratio/low_mean': 3.7468206755875144e-05, 'clip_ratio/low_min': 5.264044375508092e-06, 'clip_ratio/high_mean': 7.922306224372733e-06, 'clip_ratio/high_max': 3.168922489749093e-05, 'clip_ratio/region_mean': 4.5390514060272835e-05, 'epoch': 0.52} + + 56%|█████▌ | 570/1024 [26:02:41<21:37:37, 171.49s/it]INFO 12-02 15:27:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:27:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:27:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:27:40 
[block_pool.py:292] Successfully reset prefix cache + + 56%|█████▌ | 571/1024 [26:05:30<21:29:19, 170.77s/it] + {'loss': 0.0434, 'grad_norm': 0.0044867550022900105, 'learning_rate': 1e-05, 'num_tokens': 503293398.0, 'completions/mean_length': 6467.890625, 'completions/min_length': 874.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6310.4921875, 'completions/min_terminated_length': 874.0, 'completions/max_terminated_length': 16133.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.2998581528663635, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019022464752197266, 'sampling/sampling_logp_difference/max': 3.6936450004577637, 'sampling/importance_sampling_ratio/min': 0.024881144985556602, 'sampling/importance_sampling_ratio/mean': 0.999916136264801, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8665193468332291, 'clip_ratio/low_mean': 3.436269958001503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.567038670051261e-06, 'clip_ratio/high_max': 1.8414293663227e-05, 'clip_ratio/region_mean': 3.9929738250066293e-05, 'epoch': 0.53} + + 56%|█████▌ | 571/1024 [26:05:30<21:29:19, 170.77s/it]INFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:30:29 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▌ | 572/1024 [26:08:32<21:52:04, 174.17s/it] + {'loss': 0.0041, 'grad_norm': 0.0033805551938712597, 'learning_rate': 1e-05, 'num_tokens': 504115692.0, 'completions/mean_length': 6275.796875, 'completions/min_length': 517.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6115.349609375, 'completions/min_terminated_length': 
517.0, 'completions/max_terminated_length': 16309.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.2569621503353119, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018935590982437134, 'sampling/sampling_logp_difference/max': 3.9959733486175537, 'sampling/importance_sampling_ratio/min': 0.018389537930488586, 'sampling/importance_sampling_ratio/mean': 1.0000152587890625, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8425783589482307, 'clip_ratio/low_mean': 3.597185968828853e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.711462454702996e-06, 'clip_ratio/high_max': 1.4845849818811985e-05, 'clip_ratio/region_mean': 3.968332202930469e-05, 'epoch': 0.53} + + 56%|█████▌ | 572/1024 [26:08:32<21:52:04, 174.17s/it]INFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:33:31 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▌ | 573/1024 [26:11:14<21:23:14, 170.72s/it] + {'loss': 0.0695, 'grad_norm': 0.00652205478399992, 'learning_rate': 1e-05, 'num_tokens': 504826577.0, 'completions/mean_length': 5396.7890625, 'completions/min_length': 662.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5222.38916015625, 'completions/min_terminated_length': 662.0, 'completions/max_terminated_length': 16116.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018737314268946648, 'sampling/sampling_logp_difference/max': 6.373790740966797, 'sampling/importance_sampling_ratio/min': 0.0017056812066584826, 
'sampling/importance_sampling_ratio/mean': 0.9999775886535645, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8558806329965591, 'clip_ratio/low_mean': 1.670091853611666e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.3471904480866215e-05, 'clip_ratio/high_max': 4.3129479763592826e-05, 'clip_ratio/region_mean': 3.0172822903296037e-05, 'epoch': 0.53} + + 56%|█████▌ | 573/1024 [26:11:14<21:23:14, 170.72s/it]INFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:36:14 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▌ | 574/1024 [26:14:25<22:04:14, 176.57s/it] + {'loss': 0.0698, 'grad_norm': 0.0018958896398544312, 'learning_rate': 1e-05, 'num_tokens': 505846438.0, 'completions/mean_length': 7798.9765625, 'completions/min_length': 319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 6991.837890625, 'completions/min_terminated_length': 319.0, 'completions/max_terminated_length': 16298.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.21253062784671783, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019361287355422974, 'sampling/sampling_logp_difference/max': 10.623047828674316, 'sampling/importance_sampling_ratio/min': 2.434831731079612e-05, 'sampling/importance_sampling_ratio/mean': 0.9999515414237976, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8846152648329735, 'clip_ratio/low_mean': 2.3435458388121333e-05, 'clip_ratio/low_min': 3.954319709009724e-06, 'clip_ratio/high_mean': 1.728673169054673e-06, 'clip_ratio/high_max': 6.914692676218692e-06, 'clip_ratio/region_mean': 2.5164132239297032e-05, 'epoch': 0.53} + + 56%|█████▌ | 574/1024 [26:14:25<22:04:14, 
176.57s/it]INFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:39:24 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▌ | 575/1024 [26:17:12<21:41:14, 173.88s/it] + {'loss': 0.1295, 'grad_norm': 0.003035407979041338, 'learning_rate': 1e-05, 'num_tokens': 506670477.0, 'completions/mean_length': 6272.5546875, 'completions/min_length': 901.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6029.88037109375, 'completions/min_terminated_length': 901.0, 'completions/max_terminated_length': 16280.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019988738000392914, 'sampling/sampling_logp_difference/max': 6.716870307922363, 'sampling/importance_sampling_ratio/min': 0.0012103202752768993, 'sampling/importance_sampling_ratio/mean': 0.9999212026596069, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9714803844690323, 'clip_ratio/low_mean': 5.590463968019321e-05, 'clip_ratio/low_min': 4.822531082027126e-06, 'clip_ratio/high_mean': 5.064732249593362e-06, 'clip_ratio/high_max': 1.085428675651201e-05, 'clip_ratio/region_mean': 6.096937283928128e-05, 'epoch': 0.53} + + 56%|█████▌ | 575/1024 [26:17:12<21:41:14, 173.88s/it]INFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:42:12 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▋ | 576/1024 [26:19:59<21:22:16, 171.73s/it] + {'loss': 0.06, 'grad_norm': 0.005080445669591427, 'learning_rate': 
1e-05, 'num_tokens': 507471717.0, 'completions/mean_length': 6060.75, 'completions/min_length': 593.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5896.88916015625, 'completions/min_terminated_length': 593.0, 'completions/max_terminated_length': 16115.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3135228157043457, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019146449863910675, 'sampling/sampling_logp_difference/max': 5.961174488067627, 'sampling/importance_sampling_ratio/min': 0.0025768836494535208, 'sampling/importance_sampling_ratio/mean': 0.9999859929084778, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8791732639074326, 'clip_ratio/low_mean': 4.479086726405512e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.294149900691991e-06, 'clip_ratio/high_max': 2.1176599602767965e-05, 'clip_ratio/region_mean': 5.008501784686814e-05, 'epoch': 0.53} + + 56%|█████▋ | 576/1024 [26:19:59<21:22:16, 171.73s/it]INFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:45:01 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 56%|█████▋ | 577/1024 [26:22:47<21:12:08, 170.76s/it] + {'loss': 0.0263, 'grad_norm': 0.002491918858140707, 'learning_rate': 1e-05, 'num_tokens': 508420417.0, 'completions/mean_length': 7221.65625, 'completions/min_length': 1071.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7149.51171875, 'completions/min_terminated_length': 1071.0, 'completions/max_terminated_length': 16319.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.22908622026443481, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019857721403241158, 'sampling/sampling_logp_difference/max': 6.906219959259033, 'sampling/importance_sampling_ratio/min': 0.0010015364969149232, 'sampling/importance_sampling_ratio/mean': 0.9999144077301025, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9068904295563698, 'clip_ratio/low_mean': 3.991827338722942e-05, 'clip_ratio/low_min': 4.394445568323135e-06, 'clip_ratio/high_mean': 3.978321103659255e-06, 'clip_ratio/high_max': 1.591328441463702e-05, 'clip_ratio/region_mean': 4.389659511616628e-05, 'epoch': 0.53} + + 56%|█████▋ | 577/1024 [26:22:47<21:12:08, 170.76s/it]INFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:47:47 [block_pool.py:292] Successfully reset prefix cache + + 56%|█████▋ | 578/1024 [26:25:40<21:13:21, 171.30s/it] + {'loss': 0.1167, 'grad_norm': 0.0038857783656567335, 'learning_rate': 1e-05, 'num_tokens': 509367579.0, 'completions/mean_length': 7279.765625, 'completions/min_length': 754.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6909.67431640625, 
'completions/min_terminated_length': 754.0, 'completions/max_terminated_length': 16090.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.3782213628292084, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.01783195324242115, 'sampling/sampling_logp_difference/max': 9.374939918518066, 'sampling/importance_sampling_ratio/min': 8.482332486892119e-05, 'sampling/importance_sampling_ratio/mean': 0.9999372959136963, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7393763959407806, 'clip_ratio/low_mean': 4.729307283923845e-05, 'clip_ratio/low_min': 3.3817600524344016e-06, 'clip_ratio/high_mean': 6.80946584452613e-07, 'clip_ratio/high_max': 2.723786337810452e-06, 'clip_ratio/region_mean': 4.7974018798413454e-05, 'epoch': 0.53} + + 56%|█████▋ | 578/1024 [26:25:40<21:13:21, 171.30s/it]INFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:50:40 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 579/1024 [26:27:57<19:53:39, 160.94s/it] + {'loss': 0.1534, 'grad_norm': 0.004505726508796215, 'learning_rate': 1e-05, 'num_tokens': 510076403.0, 'completions/mean_length': 5381.4375, 'completions/min_length': 1030.0, 'completions/max_length': 15946.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5381.4375, 'completions/min_terminated_length': 1030.0, 'completions/max_terminated_length': 15946.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3861297369003296, 'frac_reward_zero_std': 0.0625, 'sampling/sampling_logp_difference/mean': 0.019285976886749268, 'sampling/sampling_logp_difference/max': 6.124998569488525, 'sampling/importance_sampling_ratio/min': 
0.0021874941885471344, 'sampling/importance_sampling_ratio/mean': 0.9999825358390808, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8337196409702301, 'clip_ratio/low_mean': 5.770765028501046e-05, 'clip_ratio/low_min': 6.032236342434771e-06, 'clip_ratio/high_mean': 6.067322146918741e-06, 'clip_ratio/high_max': 2.4269288587674964e-05, 'clip_ratio/region_mean': 6.377497174980817e-05, 'epoch': 0.53} + + 57%|█████▋ | 579/1024 [26:27:57<19:53:39, 160.94s/it]INFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:52:56 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 580/1024 [26:30:58<20:35:17, 166.93s/it] + {'loss': 0.0288, 'grad_norm': 0.0039497604593634605, 'learning_rate': 1e-05, 'num_tokens': 511177974.0, 'completions/mean_length': 8440.7109375, 'completions/min_length': 472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 8250.072265625, 'completions/min_terminated_length': 472.0, 'completions/max_terminated_length': 15789.0, 'rewards/accuracy_reward/mean': 0.1875, 'rewards/accuracy_reward/std': 0.39184603095054626, 'reward': 0.1875, 'reward_std': 0.18990950286388397, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020451124757528305, 'sampling/sampling_logp_difference/max': 8.424702644348145, 'sampling/importance_sampling_ratio/min': 0.00021938055579084903, 'sampling/importance_sampling_ratio/mean': 0.999910831451416, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8920768201351166, 'clip_ratio/low_mean': 4.1738339632502175e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.577795834848075e-06, 'clip_ratio/high_max': 1.83111833393923e-05, 'clip_ratio/region_mean': 4.631613546735025e-05, 'epoch': 0.53} + + 57%|█████▋ | 580/1024 
[26:30:58<20:35:17, 166.93s/it]INFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:55:57 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 581/1024 [26:33:44<20:31:53, 166.85s/it] + {'loss': 0.0866, 'grad_norm': 0.0024386425502598286, 'learning_rate': 1e-05, 'num_tokens': 512054655.0, 'completions/mean_length': 6702.3828125, 'completions/min_length': 1169.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6470.0244140625, 'completions/min_terminated_length': 1169.0, 'completions/max_terminated_length': 16077.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.26645052433013916, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018986206501722336, 'sampling/sampling_logp_difference/max': 6.486593246459961, 'sampling/importance_sampling_ratio/min': 0.0015237311599776149, 'sampling/importance_sampling_ratio/mean': 1.0000202655792236, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8600481152534485, 'clip_ratio/low_mean': 4.171912905803765e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.427778835884965e-06, 'clip_ratio/high_max': 1.371111534353986e-05, 'clip_ratio/region_mean': 4.514690772339236e-05, 'epoch': 0.53} + + 57%|█████▋ | 581/1024 [26:33:44<20:31:53, 166.85s/it]INFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 15:58:44 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 582/1024 [26:36:06<19:33:56, 159.36s/it] + {'loss': 0.0617, 'grad_norm': 0.0072782449424266815, 
'learning_rate': 1e-05, 'num_tokens': 512696537.0, 'completions/mean_length': 4845.953125, 'completions/min_length': 160.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 4755.1025390625, 'completions/min_terminated_length': 160.0, 'completions/max_terminated_length': 13410.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.29036980867385864, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01862735114991665, 'sampling/sampling_logp_difference/max': 4.027317047119141, 'sampling/importance_sampling_ratio/min': 0.017822081223130226, 'sampling/importance_sampling_ratio/mean': 0.9999409317970276, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9067303538322449, 'clip_ratio/low_mean': 2.6773893978315755e-05, 'clip_ratio/low_min': 4.736104074254399e-06, 'clip_ratio/high_mean': 4.2680171645770315e-06, 'clip_ratio/high_max': 9.279537152906414e-06, 'clip_ratio/region_mean': 3.1041911142892786e-05, 'epoch': 0.54} + + 57%|█████▋ | 582/1024 [26:36:06<19:33:56, 159.36s/it]INFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:01:06 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 583/1024 [26:38:33<19:03:22, 155.56s/it] + {'loss': 0.0799, 'grad_norm': 0.005057654343545437, 'learning_rate': 1e-05, 'num_tokens': 513505135.0, 'completions/mean_length': 6173.171875, 'completions/min_length': 756.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6011.095703125, 'completions/min_terminated_length': 756.0, 'completions/max_terminated_length': 16282.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 
'reward': 0.4375, 'reward_std': 0.2767051160335541, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.020879898220300674, 'sampling/sampling_logp_difference/max': 8.342979431152344, 'sampling/importance_sampling_ratio/min': 0.0002380619989708066, 'sampling/importance_sampling_ratio/mean': 0.9999635219573975, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9604142308235168, 'clip_ratio/low_mean': 4.360654588708712e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.104518898704555e-06, 'clip_ratio/high_max': 8.41807559481822e-06, 'clip_ratio/region_mean': 4.5711064331044327e-05, 'epoch': 0.54} + + 57%|█████▋ | 583/1024 [26:38:33<19:03:22, 155.56s/it]INFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:03:33 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 584/1024 [26:40:59<18:39:54, 152.71s/it] + {'loss': 0.0991, 'grad_norm': 0.0047672707587480545, 'learning_rate': 1e-05, 'num_tokens': 514232058.0, 'completions/mean_length': 5546.5234375, 'completions/min_length': 1113.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5374.50048828125, 'completions/min_terminated_length': 1113.0, 'completions/max_terminated_length': 15173.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.27038949728012085, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018185433000326157, 'sampling/sampling_logp_difference/max': 9.74951171875, 'sampling/importance_sampling_ratio/min': 5.8323133998783305e-05, 'sampling/importance_sampling_ratio/mean': 0.9999624490737915, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8015405982732773, 'clip_ratio/low_mean': 
4.2579683963595016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.227510205761064e-06, 'clip_ratio/high_max': 7.327939783863258e-06, 'clip_ratio/region_mean': 4.580719428304292e-05, 'epoch': 0.54} + + 57%|█████▋ | 584/1024 [26:40:59<18:39:54, 152.71s/it]INFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:05:59 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 585/1024 [26:43:37<18:49:10, 154.33s/it] + {'loss': 0.0453, 'grad_norm': 0.005850035231560469, 'learning_rate': 1e-05, 'num_tokens': 515103184.0, 'completions/mean_length': 6637.359375, 'completions/min_length': 1144.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6482.6513671875, 'completions/min_terminated_length': 1144.0, 'completions/max_terminated_length': 15778.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.24988999962806702, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020641878247261047, 'sampling/sampling_logp_difference/max': 15.747965812683105, 'sampling/importance_sampling_ratio/min': 1.4479226706498594e-07, 'sampling/importance_sampling_ratio/mean': 0.999963104724884, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0173144191503525, 'clip_ratio/low_mean': 5.04182496570138e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.388961428958282e-06, 'clip_ratio/high_max': 1.3804907666781219e-05, 'clip_ratio/region_mean': 5.480721097228525e-05, 'epoch': 0.54} + + 57%|█████▋ | 585/1024 [26:43:37<18:49:10, 154.33s/it]INFO 12-02 16:08:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:08:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:08:37 [block_pool.py:292] 
Successfully reset prefix cache +INFO 12-02 16:08:37 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 586/1024 [26:46:16<18:55:55, 155.61s/it] + {'loss': 0.0831, 'grad_norm': 0.0037875184789299965, 'learning_rate': 1e-05, 'num_tokens': 516009791.0, 'completions/mean_length': 6940.6171875, 'completions/min_length': 1273.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6866.259765625, 'completions/min_terminated_length': 1273.0, 'completions/max_terminated_length': 15716.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.27222442626953125, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018527517095208168, 'sampling/sampling_logp_difference/max': 12.062490463256836, 'sampling/importance_sampling_ratio/min': 5.772008080384694e-06, 'sampling/importance_sampling_ratio/mean': 0.9999997615814209, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8547529205679893, 'clip_ratio/low_mean': 5.566071547491447e-05, 'clip_ratio/low_min': 8.978264304460026e-06, 'clip_ratio/high_mean': 3.986071760664345e-06, 'clip_ratio/high_max': 1.594428704265738e-05, 'clip_ratio/region_mean': 5.964678746295249e-05, 'epoch': 0.54} + + 57%|█████▋ | 586/1024 [26:46:16<18:55:55, 155.61s/it]INFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:11:15 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 587/1024 [26:48:54<18:59:05, 156.40s/it] + {'loss': 0.0502, 'grad_norm': 0.0015506440540775657, 'learning_rate': 1e-05, 'num_tokens': 516903335.0, 'completions/mean_length': 6837.125, 'completions/min_length': 1319.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 
'completions/mean_terminated_length': 6761.95263671875, 'completions/min_terminated_length': 1319.0, 'completions/max_terminated_length': 15387.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.20593318343162537, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.020130250602960587, 'sampling/sampling_logp_difference/max': 10.0628080368042, 'sampling/importance_sampling_ratio/min': 4.2636147554730996e-05, 'sampling/importance_sampling_ratio/mean': 0.9999232292175293, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9027494043111801, 'clip_ratio/low_mean': 3.340147941344185e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.731095721879683e-06, 'clip_ratio/high_max': 6.924382887518732e-06, 'clip_ratio/region_mean': 3.5132575476382044e-05, 'epoch': 0.54} + + 57%|█████▋ | 587/1024 [26:48:54<18:59:05, 156.40s/it]INFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:13:54 [block_pool.py:292] Successfully reset prefix cache + + 57%|█████▋ | 588/1024 [26:51:53<19:44:51, 163.05s/it] + {'loss': 0.1165, 'grad_norm': 0.003520917845889926, 'learning_rate': 1e-05, 'num_tokens': 517929081.0, 'completions/mean_length': 7866.703125, 'completions/min_length': 49.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7222.5380859375, 'completions/min_terminated_length': 49.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.3316730856895447, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01890747994184494, 'sampling/sampling_logp_difference/max': 9.684585571289062, 
'sampling/importance_sampling_ratio/min': 6.223546370165423e-05, 'sampling/importance_sampling_ratio/mean': 0.9999421834945679, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8133657574653625, 'clip_ratio/low_mean': 3.885528553837503e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1935539368532773e-06, 'clip_ratio/high_max': 1.2774215747413109e-05, 'clip_ratio/region_mean': 4.204883930469805e-05, 'epoch': 0.54} + + 57%|█████▋ | 588/1024 [26:51:53<19:44:51, 163.05s/it]INFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:16:52 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 589/1024 [26:54:28<19:26:20, 160.87s/it] + {'loss': 0.0447, 'grad_norm': 0.0029796145390719175, 'learning_rate': 1e-05, 'num_tokens': 518810247.0, 'completions/mean_length': 6701.296875, 'completions/min_length': 24.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6547.603515625, 'completions/min_terminated_length': 24.0, 'completions/max_terminated_length': 15944.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2869499921798706, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01977725327014923, 'sampling/sampling_logp_difference/max': 22.101436614990234, 'sampling/importance_sampling_ratio/min': 2.520391673144218e-10, 'sampling/importance_sampling_ratio/mean': 0.9999505877494812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9360691756010056, 'clip_ratio/low_mean': 3.457626269209868e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7355519048578572e-06, 'clip_ratio/high_max': 6.942207619431429e-06, 'clip_ratio/region_mean': 3.631181459695654e-05, 'epoch': 0.54} + + 58%|█████▊ | 
589/1024 [26:54:28<19:26:20, 160.87s/it]INFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:19:28 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 590/1024 [26:57:27<20:02:48, 166.29s/it] + {'loss': 0.0477, 'grad_norm': 0.0024249793495982885, 'learning_rate': 1e-05, 'num_tokens': 519730577.0, 'completions/mean_length': 7029.453125, 'completions/min_length': 1180.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6804.9443359375, 'completions/min_terminated_length': 1180.0, 'completions/max_terminated_length': 15971.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.22803518176078796, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01923082396388054, 'sampling/sampling_logp_difference/max': 15.630853652954102, 'sampling/importance_sampling_ratio/min': 1.6278204384434503e-07, 'sampling/importance_sampling_ratio/mean': 0.9999786615371704, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9168537557125092, 'clip_ratio/low_mean': 3.738725240509666e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.476589184487239e-07, 'clip_ratio/high_max': 3.7906356737948954e-06, 'clip_ratio/region_mean': 3.8334911323545384e-05, 'epoch': 0.54} + + 58%|█████▊ | 590/1024 [26:57:27<20:02:48, 166.29s/it]INFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:22:27 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 591/1024 [27:00:27<20:29:00, 170.30s/it] + {'loss': 0.0821, 'grad_norm': 0.003160425927489996, 
'learning_rate': 1e-05, 'num_tokens': 520680707.0, 'completions/mean_length': 7255.453125, 'completions/min_length': 832.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6646.8837890625, 'completions/min_terminated_length': 832.0, 'completions/max_terminated_length': 15600.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2461756467819214, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019255205988883972, 'sampling/sampling_logp_difference/max': 6.968714237213135, 'sampling/importance_sampling_ratio/min': 0.0009408618789166212, 'sampling/importance_sampling_ratio/mean': 1.0000334978103638, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8241118341684341, 'clip_ratio/low_mean': 3.2254738812298456e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.1899421552880085e-06, 'clip_ratio/high_max': 2.4759768621152034e-05, 'clip_ratio/region_mean': 3.8444680967586464e-05, 'epoch': 0.54} + + 58%|█████▊ | 591/1024 [27:00:27<20:29:00, 170.30s/it]INFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:25:27 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 592/1024 [27:03:11<20:11:47, 168.30s/it] + {'loss': 0.0267, 'grad_norm': 0.00411194609478116, 'learning_rate': 1e-05, 'num_tokens': 521703303.0, 'completions/mean_length': 7819.96875, 'completions/min_length': 512.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7752.53564453125, 'completions/min_terminated_length': 512.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.2265625, 'rewards/accuracy_reward/std': 0.4202519655227661, 'reward': 
0.2265625, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.022727783769369125, 'sampling/sampling_logp_difference/max': 7.937360763549805, 'sampling/importance_sampling_ratio/min': 0.0003571478300727904, 'sampling/importance_sampling_ratio/mean': 0.9999041557312012, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.1218742430210114, 'clip_ratio/low_mean': 3.9836502310208743e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.864952139385423e-06, 'clip_ratio/high_max': 7.459808557541692e-06, 'clip_ratio/region_mean': 4.170145416537707e-05, 'epoch': 0.54} + + 58%|█████▊ | 592/1024 [27:03:11<20:11:47, 168.30s/it]INFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:28:10 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 593/1024 [27:05:48<19:44:37, 164.91s/it] + {'loss': 0.0339, 'grad_norm': 0.0022753921803086996, 'learning_rate': 1e-05, 'num_tokens': 522531422.0, 'completions/mean_length': 6322.8671875, 'completions/min_length': 637.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6163.1669921875, 'completions/min_terminated_length': 637.0, 'completions/max_terminated_length': 16117.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.20753081142902374, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01893780007958412, 'sampling/sampling_logp_difference/max': 12.124995231628418, 'sampling/importance_sampling_ratio/min': 5.422274170996388e-06, 'sampling/importance_sampling_ratio/mean': 0.9998952150344849, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8323960080742836, 'clip_ratio/low_mean': 3.738353416338214e-05, 
'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.714662395599589e-06, 'clip_ratio/high_max': 1.8858649582398357e-05, 'clip_ratio/region_mean': 4.2098196558981726e-05, 'epoch': 0.55} + + 58%|█████▊ | 593/1024 [27:05:48<19:44:37, 164.91s/it]INFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:30:47 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 594/1024 [27:08:45<20:09:31, 168.77s/it] + {'loss': -0.0134, 'grad_norm': 0.004338000901043415, 'learning_rate': 1e-05, 'num_tokens': 523453262.0, 'completions/mean_length': 7054.0625, 'completions/min_length': 101.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6905.96875, 'completions/min_terminated_length': 101.0, 'completions/max_terminated_length': 16055.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.13204573094844818, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.01982954889535904, 'sampling/sampling_logp_difference/max': 9.437154769897461, 'sampling/importance_sampling_ratio/min': 7.97068714746274e-05, 'sampling/importance_sampling_ratio/mean': 0.9998721480369568, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.866028867661953, 'clip_ratio/low_mean': 1.1187657776190463e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.943995564754005e-07, 'clip_ratio/high_max': 3.977598225901602e-06, 'clip_ratio/region_mean': 1.2182057332665863e-05, 'epoch': 0.55} + + 58%|█████▊ | 594/1024 [27:08:45<20:09:31, 168.77s/it]INFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-02 16:33:45 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 595/1024 [27:11:38<20:15:32, 170.00s/it] + {'loss': 0.0648, 'grad_norm': 0.003401415189728141, 'learning_rate': 1e-05, 'num_tokens': 524436831.0, 'completions/mean_length': 7539.0703125, 'completions/min_length': 446.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7027.3798828125, 'completions/min_terminated_length': 446.0, 'completions/max_terminated_length': 16361.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2511882185935974, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019884679466485977, 'sampling/sampling_logp_difference/max': 10.775017738342285, 'sampling/importance_sampling_ratio/min': 2.0915547793265432e-05, 'sampling/importance_sampling_ratio/mean': 0.999969482421875, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8601142391562462, 'clip_ratio/low_mean': 3.533169467573316e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.7596285551444453e-06, 'clip_ratio/high_max': 1.5038514220577781e-05, 'clip_ratio/region_mean': 3.9091323742468376e-05, 'epoch': 0.55} + + 58%|█████▊ | 595/1024 [27:11:38<20:15:32, 170.00s/it]INFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:36:38 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 596/1024 [27:14:31<20:19:20, 170.93s/it] + {'loss': 0.0549, 'grad_norm': 0.002879115054383874, 'learning_rate': 1e-05, 'num_tokens': 525368091.0, 'completions/mean_length': 7137.96875, 'completions/min_length': 606.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6762.11376953125, 
'completions/min_terminated_length': 606.0, 'completions/max_terminated_length': 16343.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.27062684297561646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01847894862294197, 'sampling/sampling_logp_difference/max': 7.680283546447754, 'sampling/importance_sampling_ratio/min': 0.0004618439415935427, 'sampling/importance_sampling_ratio/mean': 1.0000025033950806, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7909424379467964, 'clip_ratio/low_mean': 5.44505830930575e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.187421713046206e-06, 'clip_ratio/high_max': 2.9679867111553904e-05, 'clip_ratio/region_mean': 6.263800514716422e-05, 'epoch': 0.55} + + 58%|█████▊ | 596/1024 [27:14:31<20:19:20, 170.93s/it]INFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:39:31 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 597/1024 [27:17:02<19:32:19, 164.73s/it] + {'loss': 0.0549, 'grad_norm': 0.004545152187347412, 'learning_rate': 1e-05, 'num_tokens': 526095378.0, 'completions/mean_length': 5486.3671875, 'completions/min_length': 4.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5224.82421875, 'completions/min_terminated_length': 4.0, 'completions/max_terminated_length': 16208.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.33508801460266113, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02017204463481903, 'sampling/sampling_logp_difference/max': 9.675474166870117, 'sampling/importance_sampling_ratio/min': 6.280510569922626e-05, 
'sampling/importance_sampling_ratio/mean': 0.9998891353607178, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9588652476668358, 'clip_ratio/low_mean': 3.1269102407804894e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4413541293833987e-06, 'clip_ratio/high_max': 5.765416517533595e-06, 'clip_ratio/region_mean': 3.2710456423501455e-05, 'epoch': 0.55} + + 58%|█████▊ | 597/1024 [27:17:02<19:32:19, 164.73s/it]INFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:42:01 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 598/1024 [27:19:36<19:06:47, 161.52s/it] + {'loss': 0.0477, 'grad_norm': 0.004040954168885946, 'learning_rate': 1e-05, 'num_tokens': 526969459.0, 'completions/mean_length': 6636.0078125, 'completions/min_length': 685.0, 'completions/max_length': 16169.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6636.0078125, 'completions/min_terminated_length': 685.0, 'completions/max_terminated_length': 16169.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.3158818483352661, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02086419239640236, 'sampling/sampling_logp_difference/max': 17.61687469482422, 'sampling/importance_sampling_ratio/min': 2.2340275407373156e-08, 'sampling/importance_sampling_ratio/mean': 0.9999474287033081, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9497648254036903, 'clip_ratio/low_mean': 4.477498589494644e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.732241109195456e-06, 'clip_ratio/high_max': 1.519483475931338e-05, 'clip_ratio/region_mean': 4.950722734520241e-05, 'epoch': 0.55} + + 58%|█████▊ | 598/1024 [27:19:36<19:06:47, 161.52s/it]INFO 12-02 16:44:35 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:44:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:44:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:44:35 [block_pool.py:292] Successfully reset prefix cache + + 58%|█████▊ | 599/1024 [27:22:12<18:53:23, 160.01s/it] + {'loss': 0.1854, 'grad_norm': 0.004678349941968918, 'learning_rate': 1e-05, 'num_tokens': 527822197.0, 'completions/mean_length': 6462.953125, 'completions/min_length': 824.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6142.9189453125, 'completions/min_terminated_length': 824.0, 'completions/max_terminated_length': 15820.0, 'rewards/accuracy_reward/mean': 0.5234375, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.3345640003681183, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019832316786050797, 'sampling/sampling_logp_difference/max': 10.463495254516602, 'sampling/importance_sampling_ratio/min': 2.8560234568431042e-05, 'sampling/importance_sampling_ratio/mean': 0.9997877478599548, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9401230812072754, 'clip_ratio/low_mean': 4.7215530003086315e-05, 'clip_ratio/low_min': 5.274039267533226e-06, 'clip_ratio/high_mean': 3.946291258216661e-06, 'clip_ratio/high_max': 1.5785165032866644e-05, 'clip_ratio/region_mean': 5.116182205711084e-05, 'epoch': 0.55} + + 58%|█████▊ | 599/1024 [27:22:12<18:53:23, 160.01s/it]INFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:47:12 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▊ | 600/1024 [27:24:53<18:52:54, 160.32s/it] + {'loss': 0.0668, 'grad_norm': 0.0014094997895881534, 'learning_rate': 1e-05, 'num_tokens': 528759458.0, 
'completions/mean_length': 7172.1015625, 'completions/min_length': 1079.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6951.01611328125, 'completions/min_terminated_length': 1079.0, 'completions/max_terminated_length': 15170.0, 'rewards/accuracy_reward/mean': 0.3515625, 'rewards/accuracy_reward/std': 0.4793342351913452, 'reward': 0.3515625, 'reward_std': 0.16834919154644012, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.018519852310419083, 'sampling/sampling_logp_difference/max': 6.621304035186768, 'sampling/importance_sampling_ratio/min': 0.001331693259999156, 'sampling/importance_sampling_ratio/mean': 0.9999281167984009, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7962061613798141, 'clip_ratio/low_mean': 4.795687004843785e-05, 'clip_ratio/low_min': 7.76807610236574e-06, 'clip_ratio/high_mean': 1.0353853667766089e-06, 'clip_ratio/high_max': 4.1415414671064354e-06, 'clip_ratio/region_mean': 4.899225518784078e-05, 'epoch': 0.55} + + 59%|█████▊ | 600/1024 [27:24:53<18:52:54, 160.32s/it]INFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:49:53 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▊ | 601/1024 [27:27:37<18:57:19, 161.32s/it] + {'loss': 0.0561, 'grad_norm': 0.0038943374529480934, 'learning_rate': 1e-05, 'num_tokens': 529626893.0, 'completions/mean_length': 6612.6484375, 'completions/min_length': 480.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6378.13623046875, 'completions/min_terminated_length': 480.0, 'completions/max_terminated_length': 16195.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 
0.23934084177017212, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018441151827573776, 'sampling/sampling_logp_difference/max': 6.01370906829834, 'sampling/importance_sampling_ratio/min': 0.0024450027849525213, 'sampling/importance_sampling_ratio/mean': 0.9999620914459229, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8218385726213455, 'clip_ratio/low_mean': 5.2064756346226204e-05, 'clip_ratio/low_min': 5.341652013157727e-06, 'clip_ratio/high_mean': 3.018199095095042e-06, 'clip_ratio/high_max': 7.3846517807396594e-06, 'clip_ratio/region_mean': 5.5082955441321246e-05, 'epoch': 0.55} + + 59%|█████▊ | 601/1024 [27:27:37<18:57:19, 161.32s/it]INFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:52:36 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 602/1024 [27:30:25<19:08:59, 163.36s/it] + {'loss': 0.0577, 'grad_norm': 0.0027088895440101624, 'learning_rate': 1e-05, 'num_tokens': 530486578.0, 'completions/mean_length': 6574.9140625, 'completions/min_length': 371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6419.21484375, 'completions/min_terminated_length': 371.0, 'completions/max_terminated_length': 15898.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.26143792271614075, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.020115964114665985, 'sampling/sampling_logp_difference/max': 11.352873802185059, 'sampling/importance_sampling_ratio/min': 1.1735714906535577e-05, 'sampling/importance_sampling_ratio/mean': 1.000026822090149, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9268836230039597, 'clip_ratio/low_mean': 4.8717710285473004e-05, 
'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0524913679764722e-06, 'clip_ratio/high_max': 8.209965471905889e-06, 'clip_ratio/region_mean': 5.077020244925734e-05, 'epoch': 0.55} + + 59%|█████▉ | 602/1024 [27:30:25<19:08:59, 163.36s/it]INFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:55:25 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 603/1024 [27:32:53<18:33:54, 158.75s/it] + {'loss': 0.0461, 'grad_norm': 0.002628365531563759, 'learning_rate': 1e-05, 'num_tokens': 531303083.0, 'completions/mean_length': 6209.1953125, 'completions/min_length': 598.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6129.07861328125, 'completions/min_terminated_length': 598.0, 'completions/max_terminated_length': 14361.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.13098490238189697, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.019658785313367844, 'sampling/sampling_logp_difference/max': 10.461148262023926, 'sampling/importance_sampling_ratio/min': 2.862734254449606e-05, 'sampling/importance_sampling_ratio/mean': 0.9998608827590942, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9574517607688904, 'clip_ratio/low_mean': 1.3909025255998131e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.31241858980502e-06, 'clip_ratio/high_max': 5.24967435922008e-06, 'clip_ratio/region_mean': 1.5221443845803151e-05, 'epoch': 0.55} + + 59%|█████▉ | 603/1024 [27:32:53<18:33:54, 158.75s/it]INFO 12-02 16:57:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:57:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 16:57:53 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-02 16:57:53 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 604/1024 [27:35:45<18:59:29, 162.78s/it] + {'loss': 0.0285, 'grad_norm': 0.004664157051593065, 'learning_rate': 1e-05, 'num_tokens': 532228227.0, 'completions/mean_length': 7079.1875, 'completions/min_length': 1015.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6855.87255859375, 'completions/min_terminated_length': 1015.0, 'completions/max_terminated_length': 13873.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.30327796936035156, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018260695040225983, 'sampling/sampling_logp_difference/max': 14.43586540222168, 'sampling/importance_sampling_ratio/min': 5.377535785555665e-07, 'sampling/importance_sampling_ratio/mean': 0.9999879598617554, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.853938102722168, 'clip_ratio/low_mean': 4.9158792762682424e-05, 'clip_ratio/low_min': 4.514427928370424e-06, 'clip_ratio/high_mean': 4.753649363919976e-06, 'clip_ratio/high_max': 1.9014597455679905e-05, 'clip_ratio/region_mean': 5.39124412171077e-05, 'epoch': 0.56} + + 59%|█████▉ | 604/1024 [27:35:45<18:59:29, 162.78s/it]INFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:00:45 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 605/1024 [27:38:02<18:02:44, 155.05s/it] + {'loss': 0.0168, 'grad_norm': 0.004579839296638966, 'learning_rate': 1e-05, 'num_tokens': 533024264.0, 'completions/mean_length': 6071.5390625, 'completions/min_length': 742.0, 'completions/max_length': 15094.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 
6071.5390625, 'completions/min_terminated_length': 742.0, 'completions/max_terminated_length': 15094.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.30327799916267395, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01974770799279213, 'sampling/sampling_logp_difference/max': 7.989465236663818, 'sampling/importance_sampling_ratio/min': 0.0003390153287909925, 'sampling/importance_sampling_ratio/mean': 0.999982476234436, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.980722151696682, 'clip_ratio/low_mean': 2.1738228269896354e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.563708891211718e-06, 'clip_ratio/high_max': 3.025483556484687e-05, 'clip_ratio/region_mean': 2.9301936820047558e-05, 'epoch': 0.56} + + 59%|█████▉ | 605/1024 [27:38:02<18:02:44, 155.05s/it]INFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:03:02 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 606/1024 [27:40:58<18:44:16, 161.38s/it] + {'loss': 0.0576, 'grad_norm': 0.002537919208407402, 'learning_rate': 1e-05, 'num_tokens': 533985318.0, 'completions/mean_length': 7352.484375, 'completions/min_length': 1310.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7209.12744140625, 'completions/min_terminated_length': 1310.0, 'completions/max_terminated_length': 16086.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2580229938030243, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018647275865077972, 'sampling/sampling_logp_difference/max': 6.329618453979492, 'sampling/importance_sampling_ratio/min': 
0.0017827138071879745, 'sampling/importance_sampling_ratio/mean': 0.9999037981033325, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7858814746141434, 'clip_ratio/low_mean': 5.142044130934664e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.773990667672479e-06, 'clip_ratio/high_max': 1.3344870239961892e-05, 'clip_ratio/region_mean': 5.6194432318079635e-05, 'epoch': 0.56} + + 59%|█████▉ | 606/1024 [27:40:58<18:44:16, 161.38s/it]INFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:05:58 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 607/1024 [27:44:01<19:26:07, 167.79s/it] + {'loss': 0.0648, 'grad_norm': 0.0037982286885380745, 'learning_rate': 1e-05, 'num_tokens': 534912558.0, 'completions/mean_length': 7095.1875, 'completions/min_length': 1073.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6947.74658203125, 'completions/min_terminated_length': 1073.0, 'completions/max_terminated_length': 16082.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01693977229297161, 'sampling/sampling_logp_difference/max': 9.422355651855469, 'sampling/importance_sampling_ratio/min': 8.089523180387914e-05, 'sampling/importance_sampling_ratio/mean': 0.9999147057533264, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6846291124820709, 'clip_ratio/low_mean': 4.466222731025482e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.0977013137962786e-06, 'clip_ratio/high_max': 2.345925531699322e-05, 'clip_ratio/region_mean': 5.175992941985896e-05, 'epoch': 0.56} + + 59%|█████▉ | 607/1024 [27:44:01<19:26:07, 167.79s/it]INFO 
12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:09:01 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 608/1024 [27:46:49<19:22:58, 167.74s/it] + {'loss': 0.0716, 'grad_norm': 0.0030545955523848534, 'learning_rate': 1e-05, 'num_tokens': 535707127.0, 'completions/mean_length': 6038.1953125, 'completions/min_length': 677.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5873.9765625, 'completions/min_terminated_length': 677.0, 'completions/max_terminated_length': 15572.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3243142366409302, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018705151975154877, 'sampling/sampling_logp_difference/max': 8.624987602233887, 'sampling/importance_sampling_ratio/min': 0.00017956242663785815, 'sampling/importance_sampling_ratio/mean': 0.9999387264251709, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8637901693582535, 'clip_ratio/low_mean': 6.557838094067847e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2919629170937696e-06, 'clip_ratio/high_max': 5.167851668375079e-06, 'clip_ratio/region_mean': 6.687034363039857e-05, 'epoch': 0.56} + + 59%|█████▉ | 608/1024 [27:46:49<19:22:58, 167.74s/it]INFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:11:48 [block_pool.py:292] Successfully reset prefix cache + + 59%|█████▉ | 609/1024 [27:54:58<19:15:18, 167.03s/it] + {'loss': 0.0698, 'grad_norm': 0.002951717935502529, 'learning_rate': 1e-05, 'num_tokens': 536618376.0, 
'completions/mean_length': 6978.0078125, 'completions/min_length': 69.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6828.70654296875, 'completions/min_terminated_length': 69.0, 'completions/max_terminated_length': 14906.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3527044355869293, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018486514687538147, 'sampling/sampling_logp_difference/max': 10.160879135131836, 'sampling/importance_sampling_ratio/min': 3.865327380481176e-05, 'sampling/importance_sampling_ratio/mean': 0.9999598264694214, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7931060045957565, 'clip_ratio/low_mean': 5.012885230826214e-05, 'clip_ratio/low_min': 3.5653165468829684e-06, 'clip_ratio/high_mean': 5.544901910070621e-06, 'clip_ratio/high_max': 1.7691760149318725e-05, 'clip_ratio/region_mean': 5.5673754559393274e-05, 'epoch': 0.56} + + 59%|█████▉ | 609/1024 [27:54:58<19:15:18, 167.03s/it]INFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:19:58 [block_pool.py:292] Successfully reset prefix cache + + 60%|█████▉ | 610/1024 [27:57:42<30:03:20, 261.35s/it] + {'loss': 0.0973, 'grad_norm': 0.0019385438645258546, 'learning_rate': 1e-05, 'num_tokens': 537513876.0, 'completions/mean_length': 6810.15625, 'completions/min_length': 477.0, 'completions/max_length': 15329.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6810.15625, 'completions/min_terminated_length': 477.0, 'completions/max_terminated_length': 15329.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.28011518716812134, 
'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02021351456642151, 'sampling/sampling_logp_difference/max': 9.934880256652832, 'sampling/importance_sampling_ratio/min': 4.845474904868752e-05, 'sampling/importance_sampling_ratio/mean': 1.000025749206543, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8957240954041481, 'clip_ratio/low_mean': 6.101864732954709e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.397787731453718e-06, 'clip_ratio/high_max': 2.1591150925814873e-05, 'clip_ratio/region_mean': 6.6416435629435e-05, 'epoch': 0.56} + + 60%|█████▉ | 610/1024 [27:57:42<30:03:20, 261.35s/it]INFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:22:42 [block_pool.py:292] Successfully reset prefix cache + + 60%|█████▉ | 611/1024 [28:00:36<27:13:03, 237.25s/it] + {'loss': 0.0319, 'grad_norm': 0.001886329147964716, 'learning_rate': 1e-05, 'num_tokens': 538419265.0, 'completions/mean_length': 6940.4140625, 'completions/min_length': 370.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6713.7685546875, 'completions/min_terminated_length': 370.0, 'completions/max_terminated_length': 16065.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.19568344950675964, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019072774797677994, 'sampling/sampling_logp_difference/max': 14.18748950958252, 'sampling/importance_sampling_ratio/min': 6.893687327647058e-07, 'sampling/importance_sampling_ratio/mean': 1.0000052452087402, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8646975234150887, 'clip_ratio/low_mean': 1.2616926369446446e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 
5.914362077419355e-06, 'clip_ratio/high_max': 1.4817902865615906e-05, 'clip_ratio/region_mean': 1.8531288333178964e-05, 'epoch': 0.56} + + 60%|█████▉ | 611/1024 [28:00:36<27:13:03, 237.25s/it]INFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:25:36 [block_pool.py:292] Successfully reset prefix cache + + 60%|█████▉ | 612/1024 [28:03:47<25:33:16, 223.29s/it] + {'loss': 0.0335, 'grad_norm': 0.002031022449955344, 'learning_rate': 1e-05, 'num_tokens': 539399127.0, 'completions/mean_length': 7508.796875, 'completions/min_length': 607.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6995.35498046875, 'completions/min_terminated_length': 607.0, 'completions/max_terminated_length': 15960.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.2301519513130188, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01832709088921547, 'sampling/sampling_logp_difference/max': 5.177490234375, 'sampling/importance_sampling_ratio/min': 0.0056421491317451, 'sampling/importance_sampling_ratio/mean': 0.9999816417694092, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7723299860954285, 'clip_ratio/low_mean': 3.254086982451554e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5920325040497119e-06, 'clip_ratio/high_max': 6.3681300161988474e-06, 'clip_ratio/region_mean': 3.4132902555938927e-05, 'epoch': 0.56} + + 60%|█████▉ | 612/1024 [28:03:47<25:33:16, 223.29s/it]INFO 12-02 17:28:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:28:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:28:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:28:47 [block_pool.py:292] 
Successfully reset prefix cache + + 60%|█████▉ | 613/1024 [28:06:24<23:12:15, 203.25s/it] + {'loss': 0.1072, 'grad_norm': 0.003653773572295904, 'learning_rate': 1e-05, 'num_tokens': 540189602.0, 'completions/mean_length': 6019.6484375, 'completions/min_length': 1020.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5938.03955078125, 'completions/min_terminated_length': 1020.0, 'completions/max_terminated_length': 15816.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.26143303513526917, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017161473631858826, 'sampling/sampling_logp_difference/max': 5.242223262786865, 'sampling/importance_sampling_ratio/min': 0.005288486368954182, 'sampling/importance_sampling_ratio/mean': 0.9999122619628906, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7425512671470642, 'clip_ratio/low_mean': 2.6742804038804024e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.9622444774067844e-06, 'clip_ratio/high_max': 1.5848977909627138e-05, 'clip_ratio/region_mean': 3.070504851621081e-05, 'epoch': 0.56} + + 60%|█████▉ | 613/1024 [28:06:24<23:12:15, 203.25s/it]INFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:31:23 [block_pool.py:292] Successfully reset prefix cache + + 60%|█████▉ | 614/1024 [28:09:22<22:18:39, 195.90s/it] + {'loss': 0.0346, 'grad_norm': 0.003739065257832408, 'learning_rate': 1e-05, 'num_tokens': 541125587.0, 'completions/mean_length': 7155.6953125, 'completions/min_length': 987.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6621.826171875, 'completions/min_terminated_length': 987.0, 
'completions/max_terminated_length': 15861.0, 'rewards/accuracy_reward/mean': 0.265625, 'rewards/accuracy_reward/std': 0.44340085983276367, 'reward': 0.265625, 'reward_std': 0.2522490322589874, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02008877694606781, 'sampling/sampling_logp_difference/max': 11.59233570098877, 'sampling/importance_sampling_ratio/min': 9.236609002982732e-06, 'sampling/importance_sampling_ratio/mean': 0.9999271631240845, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9789249897003174, 'clip_ratio/low_mean': 3.428678644468164e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.754297725005017e-06, 'clip_ratio/high_max': 1.1017190900020069e-05, 'clip_ratio/region_mean': 3.7041084169686656e-05, 'epoch': 0.56} + + 60%|█████▉ | 614/1024 [28:09:22<22:18:39, 195.90s/it]INFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:34:22 [block_pool.py:292] Successfully reset prefix cache + + 60%|██████ | 615/1024 [28:12:21<21:40:21, 190.76s/it] + {'loss': 0.0524, 'grad_norm': 0.0020656392443925142, 'learning_rate': 1e-05, 'num_tokens': 542173801.0, 'completions/mean_length': 8027.359375, 'completions/min_length': 248.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7470.25048828125, 'completions/min_terminated_length': 248.0, 'completions/max_terminated_length': 13553.0, 'rewards/accuracy_reward/mean': 0.2578125, 'rewards/accuracy_reward/std': 0.43914902210235596, 'reward': 0.2578125, 'reward_std': 0.22225633263587952, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.021495234221220016, 'sampling/sampling_logp_difference/max': 8.124446868896484, 'sampling/importance_sampling_ratio/min': 0.00029620854184031487, 
'sampling/importance_sampling_ratio/mean': 0.999947190284729, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9153474718332291, 'clip_ratio/low_mean': 4.249646542575647e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.4549021873099264e-06, 'clip_ratio/high_max': 5.6091539590852335e-06, 'clip_ratio/region_mean': 4.4951367613066395e-05, 'epoch': 0.57} + + 60%|██████ | 615/1024 [28:12:21<21:40:21, 190.76s/it]INFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:37:21 [block_pool.py:292] Successfully reset prefix cache + + 60%|██████ | 616/1024 [28:14:56<20:23:49, 179.97s/it] + {'loss': 0.0648, 'grad_norm': 0.00824788399040699, 'learning_rate': 1e-05, 'num_tokens': 542977266.0, 'completions/mean_length': 6115.3828125, 'completions/min_length': 1158.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5952.38916015625, 'completions/min_terminated_length': 1158.0, 'completions/max_terminated_length': 15879.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.30616888403892517, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017732972279191017, 'sampling/sampling_logp_difference/max': 6.622807502746582, 'sampling/importance_sampling_ratio/min': 0.0013296925462782383, 'sampling/importance_sampling_ratio/mean': 0.9999478459358215, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.751783661544323, 'clip_ratio/low_mean': 5.2193488272678223e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.325646500547009e-06, 'clip_ratio/high_max': 1.7302586002188036e-05, 'clip_ratio/region_mean': 5.6519134659538395e-05, 'epoch': 0.57} + + 60%|██████ | 616/1024 [28:14:56<20:23:49, 179.97s/it]INFO 12-02 17:39:56 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:39:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:39:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:39:56 [block_pool.py:292] Successfully reset prefix cache + + 60%|██████ | 617/1024 [28:17:53<20:15:23, 179.17s/it] + {'loss': 0.0613, 'grad_norm': 0.005189655348658562, 'learning_rate': 1e-05, 'num_tokens': 543947515.0, 'completions/mean_length': 7431.3203125, 'completions/min_length': 738.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7142.52392578125, 'completions/min_terminated_length': 738.0, 'completions/max_terminated_length': 15688.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.21595832705497742, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.02111673541367054, 'sampling/sampling_logp_difference/max': 8.644620895385742, 'sampling/importance_sampling_ratio/min': 0.00017607140762265772, 'sampling/importance_sampling_ratio/mean': 0.9999845623970032, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9122852608561516, 'clip_ratio/low_mean': 5.301810256241879e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.185486876755022e-06, 'clip_ratio/high_max': 2.872588265745435e-05, 'clip_ratio/region_mean': 6.120358921180014e-05, 'epoch': 0.57} + + 60%|██████ | 617/1024 [28:17:53<20:15:23, 179.17s/it]INFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:42:53 [block_pool.py:292] Successfully reset prefix cache + + 60%|██████ | 618/1024 [28:20:33<19:33:45, 173.46s/it] + {'loss': 0.0773, 'grad_norm': 0.004707770887762308, 'learning_rate': 1e-05, 'num_tokens': 544694826.0, 
'completions/mean_length': 5700.5546875, 'completions/min_length': 727.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5530.9765625, 'completions/min_terminated_length': 727.0, 'completions/max_terminated_length': 16378.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3366856575012207, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018697837367653847, 'sampling/sampling_logp_difference/max': 21.374990463256836, 'sampling/importance_sampling_ratio/min': 5.211461817644647e-10, 'sampling/importance_sampling_ratio/mean': 0.9998490214347839, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8961661159992218, 'clip_ratio/low_mean': 3.414959587644262e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 9.961746627595858e-07, 'clip_ratio/high_max': 3.984698651038343e-06, 'clip_ratio/region_mean': 3.514577088026272e-05, 'epoch': 0.57} + + 60%|██████ | 618/1024 [28:20:33<19:33:45, 173.46s/it]INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:45:33 [block_pool.py:292] Successfully reset prefix cache + + 60%|██████ | 619/1024 [28:22:21<17:17:44, 153.74s/it] + {'loss': 0.0492, 'grad_norm': 0.00980924628674984, 'learning_rate': 1e-05, 'num_tokens': 545255377.0, 'completions/mean_length': 4201.6796875, 'completions/min_length': 436.0, 'completions/max_length': 12422.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 4201.6796875, 'completions/min_terminated_length': 436.0, 'completions/max_terminated_length': 12422.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.38664889335632324, 'frac_reward_zero_std': 
0.125, 'sampling/sampling_logp_difference/mean': 0.016301468014717102, 'sampling/sampling_logp_difference/max': 9.455235481262207, 'sampling/importance_sampling_ratio/min': 7.827866647858173e-05, 'sampling/importance_sampling_ratio/mean': 1.000074028968811, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7066933363676071, 'clip_ratio/low_mean': 5.229935004535946e-05, 'clip_ratio/low_min': 4.098226327187149e-06, 'clip_ratio/high_mean': 2.9524303499783855e-06, 'clip_ratio/high_max': 1.1809721399913542e-05, 'clip_ratio/region_mean': 5.525178062271152e-05, 'epoch': 0.57} + + 60%|██████ | 619/1024 [28:22:21<17:17:44, 153.74s/it]INFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:47:21 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 620/1024 [28:24:50<17:06:05, 152.39s/it] + {'loss': 0.077, 'grad_norm': 0.005619424395263195, 'learning_rate': 1e-05, 'num_tokens': 546013882.0, 'completions/mean_length': 5782.2578125, 'completions/min_length': 434.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5613.9765625, 'completions/min_terminated_length': 434.0, 'completions/max_terminated_length': 13234.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2472364753484726, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018704919144511223, 'sampling/sampling_logp_difference/max': 9.267168045043945, 'sampling/importance_sampling_ratio/min': 9.447568299947307e-05, 'sampling/importance_sampling_ratio/mean': 1.0000319480895996, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.846621498465538, 'clip_ratio/low_mean': 1.853809601470857e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 
1.5232756140903803e-06, 'clip_ratio/high_max': 6.093102456361521e-06, 'clip_ratio/region_mean': 2.0061371856172627e-05, 'epoch': 0.57} + + 61%|██████ | 620/1024 [28:24:50<17:06:05, 152.39s/it]INFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:49:50 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 621/1024 [28:27:24<17:06:56, 152.90s/it] + {'loss': 0.0964, 'grad_norm': 0.0063271005637943745, 'learning_rate': 1e-05, 'num_tokens': 546954857.0, 'completions/mean_length': 7191.4921875, 'completions/min_length': 1379.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7045.57958984375, 'completions/min_terminated_length': 1379.0, 'completions/max_terminated_length': 15569.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01846012845635414, 'sampling/sampling_logp_difference/max': 5.062449932098389, 'sampling/importance_sampling_ratio/min': 0.006330032367259264, 'sampling/importance_sampling_ratio/mean': 0.9999164342880249, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7846563309431076, 'clip_ratio/low_mean': 4.008232758678787e-05, 'clip_ratio/low_min': 3.511630438879365e-06, 'clip_ratio/high_mean': 4.186933551864058e-06, 'clip_ratio/high_max': 1.6747734207456233e-05, 'clip_ratio/region_mean': 4.426926193445979e-05, 'epoch': 0.57} + + 61%|██████ | 621/1024 [28:27:24<17:06:56, 152.90s/it]INFO 12-02 17:52:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:52:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:52:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:52:24 
[block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 622/1024 [28:29:32<16:13:51, 145.35s/it] + {'loss': 0.1013, 'grad_norm': 0.005836677737534046, 'learning_rate': 1e-05, 'num_tokens': 547676024.0, 'completions/mean_length': 5491.7421875, 'completions/min_length': 1644.0, 'completions/max_length': 15529.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5491.7421875, 'completions/min_terminated_length': 1644.0, 'completions/max_terminated_length': 15529.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.43213340640068054, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.016565188765525818, 'sampling/sampling_logp_difference/max': 7.7476348876953125, 'sampling/importance_sampling_ratio/min': 0.00043176248436793685, 'sampling/importance_sampling_ratio/mean': 0.999930739402771, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6960643380880356, 'clip_ratio/low_mean': 5.253966105556174e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2486661603361426e-05, 'clip_ratio/high_max': 3.451678094279487e-05, 'clip_ratio/region_mean': 6.502632390947838e-05, 'epoch': 0.57} + + 61%|██████ | 622/1024 [28:29:32<16:13:51, 145.35s/it]INFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:54:32 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 623/1024 [28:32:28<17:11:38, 154.36s/it] + {'loss': 0.0326, 'grad_norm': 0.00226933928206563, 'learning_rate': 1e-05, 'num_tokens': 548590080.0, 'completions/mean_length': 6993.125, 'completions/min_length': 980.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6844.06396484375, 'completions/min_terminated_length': 980.0, 
'completions/max_terminated_length': 16179.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.19332444667816162, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01880657486617565, 'sampling/sampling_logp_difference/max': 13.68293285369873, 'sampling/importance_sampling_ratio/min': 1.1417677114877733e-06, 'sampling/importance_sampling_ratio/mean': 1.000011682510376, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8031502217054367, 'clip_ratio/low_mean': 3.0399249226320535e-05, 'clip_ratio/low_min': 5.838393462909153e-06, 'clip_ratio/high_mean': 1.079745743481908e-06, 'clip_ratio/high_max': 4.318982973927632e-06, 'clip_ratio/region_mean': 3.147899496980244e-05, 'epoch': 0.57} + + 61%|██████ | 623/1024 [28:32:28<17:11:38, 154.36s/it]INFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:57:27 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 624/1024 [28:34:45<16:34:24, 149.16s/it] + {'loss': 0.0394, 'grad_norm': 0.005067484453320503, 'learning_rate': 1e-05, 'num_tokens': 549327251.0, 'completions/mean_length': 5602.8359375, 'completions/min_length': 100.0, 'completions/max_length': 15278.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5602.8359375, 'completions/min_terminated_length': 100.0, 'completions/max_terminated_length': 15278.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 0.35218530893325806, 'frac_reward_zero_std': 0.1875, 'sampling/sampling_logp_difference/mean': 0.018545404076576233, 'sampling/sampling_logp_difference/max': 5.624884605407715, 'sampling/importance_sampling_ratio/min': 0.0036069792695343494, 
'sampling/importance_sampling_ratio/mean': 0.9999701380729675, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8287182524800301, 'clip_ratio/low_mean': 4.231840989632474e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.101052132275072e-06, 'clip_ratio/high_max': 8.404208529100288e-06, 'clip_ratio/region_mean': 4.441946202859981e-05, 'epoch': 0.57} + + 61%|██████ | 624/1024 [28:34:45<16:34:24, 149.16s/it]INFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:59:44 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 625/1024 [28:37:29<17:01:25, 153.60s/it] + {'loss': 0.0064, 'grad_norm': 0.0023132911883294582, 'learning_rate': 1e-05, 'num_tokens': 550208750.0, 'completions/mean_length': 6747.0234375, 'completions/min_length': 879.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6671.1416015625, 'completions/min_terminated_length': 879.0, 'completions/max_terminated_length': 15901.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.30904704332351685, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019216621294617653, 'sampling/sampling_logp_difference/max': 5.592033386230469, 'sampling/importance_sampling_ratio/min': 0.003727440955117345, 'sampling/importance_sampling_ratio/mean': 0.9999475479125977, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8722762316465378, 'clip_ratio/low_mean': 4.6288066641864134e-05, 'clip_ratio/low_min': 5.32640206074575e-06, 'clip_ratio/high_mean': 1.8743556893241475e-06, 'clip_ratio/high_max': 7.49742275729659e-06, 'clip_ratio/region_mean': 4.816242244487512e-05, 'epoch': 0.57} + + 61%|██████ | 625/1024 [28:37:29<17:01:25, 153.60s/it]INFO 
12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:02:28 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 626/1024 [28:40:10<17:13:31, 155.81s/it] + {'loss': 0.0905, 'grad_norm': 0.0036700034979730844, 'learning_rate': 1e-05, 'num_tokens': 551123002.0, 'completions/mean_length': 6983.40625, 'completions/min_length': 385.0, 'completions/max_length': 16027.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6983.40625, 'completions/min_terminated_length': 385.0, 'completions/max_terminated_length': 16027.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2419992983341217, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019318291917443275, 'sampling/sampling_logp_difference/max': 9.8963041305542, 'sampling/importance_sampling_ratio/min': 5.0360464229015633e-05, 'sampling/importance_sampling_ratio/mean': 0.9999868273735046, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8781512826681137, 'clip_ratio/low_mean': 6.517495285152108e-05, 'clip_ratio/low_min': 1.1217302017030306e-05, 'clip_ratio/high_mean': 1.923391891978099e-06, 'clip_ratio/high_max': 7.693567567912396e-06, 'clip_ratio/region_mean': 6.709834497087286e-05, 'epoch': 0.58} + + 61%|██████ | 626/1024 [28:40:10<17:13:31, 155.81s/it]INFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:05:09 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████ | 627/1024 [28:43:06<17:52:14, 162.05s/it] + {'loss': 0.0268, 'grad_norm': 0.0036717690527439117, 'learning_rate': 1e-05, 'num_tokens': 552055472.0, 
'completions/mean_length': 7143.671875, 'completions/min_length': 451.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6689.22900390625, 'completions/min_terminated_length': 451.0, 'completions/max_terminated_length': 16201.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.2212003767490387, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018518533557653427, 'sampling/sampling_logp_difference/max': 9.0, 'sampling/importance_sampling_ratio/min': 0.00012340980174485594, 'sampling/importance_sampling_ratio/mean': 0.9998798966407776, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7715872526168823, 'clip_ratio/low_mean': 5.9073974398415885e-05, 'clip_ratio/low_min': 6.781316187698394e-06, 'clip_ratio/high_mean': 1.2745738331432221e-06, 'clip_ratio/high_max': 5.098295332572889e-06, 'clip_ratio/region_mean': 6.034854845893278e-05, 'epoch': 0.58} + + 61%|██████ | 627/1024 [28:43:06<17:52:14, 162.05s/it]INFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:08:06 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████▏ | 628/1024 [28:46:00<18:13:29, 165.68s/it] + {'loss': 0.0494, 'grad_norm': 0.0019187588477507234, 'learning_rate': 1e-05, 'num_tokens': 552914275.0, 'completions/mean_length': 6558.5859375, 'completions/min_length': 1061.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6075.36865234375, 'completions/min_terminated_length': 1061.0, 'completions/max_terminated_length': 15729.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2041158676147461, 
'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01948089525103569, 'sampling/sampling_logp_difference/max': 9.07090950012207, 'sampling/importance_sampling_ratio/min': 0.00011496193474158645, 'sampling/importance_sampling_ratio/mean': 0.9999418258666992, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9016438648104668, 'clip_ratio/low_mean': 2.460010267668622e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.4468678197517875e-06, 'clip_ratio/high_max': 1.778747127900715e-05, 'clip_ratio/region_mean': 2.9046970439594588e-05, 'epoch': 0.58} + + 61%|██████▏ | 628/1024 [28:46:00<18:13:29, 165.68s/it]INFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:11:00 [block_pool.py:292] Successfully reset prefix cache + + 61%|██████▏ | 629/1024 [28:48:35<17:49:37, 162.48s/it] + {'loss': 0.0907, 'grad_norm': 0.003598993644118309, 'learning_rate': 1e-05, 'num_tokens': 553719958.0, 'completions/mean_length': 6150.2734375, 'completions/min_length': 596.0, 'completions/max_length': 15812.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6150.2734375, 'completions/min_terminated_length': 596.0, 'completions/max_terminated_length': 15812.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.3022220730781555, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019557828083634377, 'sampling/sampling_logp_difference/max': 7.093727111816406, 'sampling/importance_sampling_ratio/min': 0.000830297009088099, 'sampling/importance_sampling_ratio/mean': 0.9999948740005493, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8385711833834648, 'clip_ratio/low_mean': 4.3287541757308645e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 
3.4582062653498724e-06, 'clip_ratio/high_max': 1.383282506139949e-05, 'clip_ratio/region_mean': 4.674574802265852e-05, 'epoch': 0.58} + + 61%|██████▏ | 629/1024 [28:48:35<17:49:37, 162.48s/it]INFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:13:35 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 630/1024 [28:51:46<18:43:12, 171.05s/it] + {'loss': 0.0261, 'grad_norm': 0.002453390508890152, 'learning_rate': 1e-05, 'num_tokens': 554784458.0, 'completions/mean_length': 8142.46875, 'completions/min_length': 1828.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7519.16015625, 'completions/min_terminated_length': 1828.0, 'completions/max_terminated_length': 16354.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.1422954648733139, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.019445519894361496, 'sampling/sampling_logp_difference/max': 8.498891830444336, 'sampling/importance_sampling_ratio/min': 0.0002036939695244655, 'sampling/importance_sampling_ratio/mean': 0.9999715089797974, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8508284538984299, 'clip_ratio/low_mean': 1.7461135655594262e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.672075301139557e-07, 'clip_ratio/high_max': 2.668830120455823e-06, 'clip_ratio/region_mean': 1.8128343185708218e-05, 'epoch': 0.58} + + 62%|██████▏ | 630/1024 [28:51:46<18:43:12, 171.05s/it]INFO 12-02 18:16:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:16:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:16:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:16:46 [block_pool.py:292] 
Successfully reset prefix cache + + 62%|██████▏ | 631/1024 [28:54:36<18:37:35, 170.62s/it] + {'loss': 0.0245, 'grad_norm': 0.0027936683036386967, 'learning_rate': 1e-05, 'num_tokens': 555783296.0, 'completions/mean_length': 7665.921875, 'completions/min_length': 791.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7384.693359375, 'completions/min_terminated_length': 791.0, 'completions/max_terminated_length': 16109.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.24435830116271973, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01912892609834671, 'sampling/sampling_logp_difference/max': 8.187341690063477, 'sampling/importance_sampling_ratio/min': 0.0002781523216981441, 'sampling/importance_sampling_ratio/mean': 0.9998488426208496, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7667205557227135, 'clip_ratio/low_mean': 3.1556500402984966e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.750615062221186e-06, 'clip_ratio/high_max': 1.9002460248884745e-05, 'clip_ratio/region_mean': 3.630711614732718e-05, 'epoch': 0.58} + + 62%|██████▏ | 631/1024 [28:54:36<18:37:35, 170.62s/it]INFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:19:36 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 632/1024 [28:57:25<18:30:49, 170.03s/it] + {'loss': 0.1028, 'grad_norm': 0.004213637672364712, 'learning_rate': 1e-05, 'num_tokens': 556732942.0, 'completions/mean_length': 7266.171875, 'completions/min_length': 1117.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6972.04833984375, 'completions/min_terminated_length': 1117.0, 
'completions/max_terminated_length': 16379.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.3135277032852173, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01689826510846615, 'sampling/sampling_logp_difference/max': 13.249999046325684, 'sampling/importance_sampling_ratio/min': 1.760348027346481e-06, 'sampling/importance_sampling_ratio/mean': 0.9999159574508667, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7114122956991196, 'clip_ratio/low_mean': 3.8605214058407e-05, 'clip_ratio/low_min': 6.2870940382708795e-06, 'clip_ratio/high_mean': 3.8924990235500445e-06, 'clip_ratio/high_max': 1.5569996094200178e-05, 'clip_ratio/region_mean': 4.249771222930576e-05, 'epoch': 0.58} + + 62%|██████▏ | 632/1024 [28:57:25<18:30:49, 170.03s/it]INFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:22:24 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 633/1024 [29:00:12<18:23:09, 169.28s/it] + {'loss': 0.0406, 'grad_norm': 0.004169877618551254, 'learning_rate': 1e-05, 'num_tokens': 557589141.0, 'completions/mean_length': 6532.9921875, 'completions/min_length': 757.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6296.568359375, 'completions/min_terminated_length': 757.0, 'completions/max_terminated_length': 16054.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.2675113081932068, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018738210201263428, 'sampling/sampling_logp_difference/max': 12.311498641967773, 'sampling/importance_sampling_ratio/min': 4.499705482885474e-06, 
'sampling/importance_sampling_ratio/mean': 0.9999022483825684, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7711968123912811, 'clip_ratio/low_mean': 3.640393322257296e-05, 'clip_ratio/low_min': 3.0146634344419e-06, 'clip_ratio/high_mean': 5.434466118003911e-06, 'clip_ratio/high_max': 2.1737864472015644e-05, 'clip_ratio/region_mean': 4.183839985216764e-05, 'epoch': 0.58} + + 62%|██████▏ | 633/1024 [29:00:12<18:23:09, 169.28s/it]INFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:25:12 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 634/1024 [29:03:07<18:31:43, 171.03s/it] + {'loss': 0.0565, 'grad_norm': 0.0032470994628965855, 'learning_rate': 1e-05, 'num_tokens': 558557286.0, 'completions/mean_length': 7384.3203125, 'completions/min_length': 87.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7168.328125, 'completions/min_terminated_length': 87.0, 'completions/max_terminated_length': 16337.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019018521532416344, 'sampling/sampling_logp_difference/max': 8.535643577575684, 'sampling/importance_sampling_ratio/min': 0.00019634375348687172, 'sampling/importance_sampling_ratio/mean': 0.9999680519104004, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8054972141981125, 'clip_ratio/low_mean': 6.070675681257853e-05, 'clip_ratio/low_min': 5.175126261747209e-06, 'clip_ratio/high_mean': 1.5248809290824283e-06, 'clip_ratio/high_max': 6.099523716329713e-06, 'clip_ratio/region_mean': 6.223163745744387e-05, 'epoch': 0.58} + + 62%|██████▏ | 634/1024 [29:03:07<18:31:43, 
171.03s/it]INFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:28:07 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 635/1024 [29:05:40<17:53:48, 165.63s/it] + {'loss': 0.1247, 'grad_norm': 0.004848263692110777, 'learning_rate': 1e-05, 'num_tokens': 559364639.0, 'completions/mean_length': 6131.9453125, 'completions/min_length': 820.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6051.22021484375, 'completions/min_terminated_length': 820.0, 'completions/max_terminated_length': 15918.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.27328526973724365, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018360167741775513, 'sampling/sampling_logp_difference/max': 12.124655723571777, 'sampling/importance_sampling_ratio/min': 5.424115443020128e-06, 'sampling/importance_sampling_ratio/mean': 1.000056266784668, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8365718051791191, 'clip_ratio/low_mean': 3.798940008437057e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1092134911905305e-05, 'clip_ratio/high_max': 4.436853964762122e-05, 'clip_ratio/region_mean': 4.908153437099827e-05, 'epoch': 0.58} + + 62%|██████▏ | 635/1024 [29:05:40<17:53:48, 165.63s/it]INFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:30:40 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 636/1024 [29:08:24<17:48:09, 165.18s/it] + {'loss': 0.0279, 'grad_norm': 0.003403177484869957, 'learning_rate': 1e-05, 'num_tokens': 
560119248.0, 'completions/mean_length': 5746.8828125, 'completions/min_length': 131.0, 'completions/max_length': 15724.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5746.8828125, 'completions/min_terminated_length': 131.0, 'completions/max_terminated_length': 15724.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.015006184577941895, 'sampling/sampling_logp_difference/max': 14.25, 'sampling/importance_sampling_ratio/min': 6.475952432083432e-07, 'sampling/importance_sampling_ratio/mean': 0.9999486207962036, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6247628927230835, 'clip_ratio/low_mean': 2.7543567512111622e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.849658353123232e-06, 'clip_ratio/high_max': 1.9398633412492927e-05, 'clip_ratio/region_mean': 3.239322609260853e-05, 'epoch': 0.59} + + 62%|██████▏ | 636/1024 [29:08:24<17:48:09, 165.18s/it]INFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:33:24 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 637/1024 [29:11:12<17:49:07, 165.76s/it] + {'loss': 0.093, 'grad_norm': 0.004058506805449724, 'learning_rate': 1e-05, 'num_tokens': 561072493.0, 'completions/mean_length': 7313.7890625, 'completions/min_length': 1068.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7096.1044921875, 'completions/min_terminated_length': 1068.0, 'completions/max_terminated_length': 16209.0, 'rewards/accuracy_reward/mean': 0.375, 'rewards/accuracy_reward/std': 0.4860251843929291, 'reward': 0.375, 'reward_std': 0.3079911172389984, 
'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01940958946943283, 'sampling/sampling_logp_difference/max': 7.320003509521484, 'sampling/importance_sampling_ratio/min': 0.0006621598731726408, 'sampling/importance_sampling_ratio/mean': 0.9999264478683472, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8606570512056351, 'clip_ratio/low_mean': 4.927243321617425e-05, 'clip_ratio/low_min': 5.929088274569949e-06, 'clip_ratio/high_mean': 8.111364707019675e-06, 'clip_ratio/high_max': 2.857848289750109e-05, 'clip_ratio/region_mean': 5.738379809372418e-05, 'epoch': 0.59} + + 62%|██████▏ | 637/1024 [29:11:12<17:49:07, 165.76s/it]INFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:36:11 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 638/1024 [29:14:01<17:52:57, 166.78s/it] + {'loss': 0.098, 'grad_norm': 0.002768489997833967, 'learning_rate': 1e-05, 'num_tokens': 562048734.0, 'completions/mean_length': 7495.5078125, 'completions/min_length': 882.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7425.51953125, 'completions/min_terminated_length': 882.0, 'completions/max_terminated_length': 16093.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.344813734292984, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.0189508069306612, 'sampling/sampling_logp_difference/max': 11.133618354797363, 'sampling/importance_sampling_ratio/min': 1.4612716768169776e-05, 'sampling/importance_sampling_ratio/mean': 0.9999319314956665, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8225502669811249, 'clip_ratio/low_mean': 4.890350828645751e-05, 'clip_ratio/low_min': 
3.968002147303196e-06, 'clip_ratio/high_mean': 7.758043807370996e-06, 'clip_ratio/high_max': 2.7213282010052353e-05, 'clip_ratio/region_mean': 5.666155129802064e-05, 'epoch': 0.59} + + 62%|██████▏ | 638/1024 [29:14:01<17:52:57, 166.78s/it]INFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:39:00 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▏ | 639/1024 [29:16:56<18:06:14, 169.28s/it] + {'loss': 0.0507, 'grad_norm': 0.002966079628095031, 'learning_rate': 1e-05, 'num_tokens': 562945623.0, 'completions/mean_length': 6856.5703125, 'completions/min_length': 173.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6627.912109375, 'completions/min_terminated_length': 173.0, 'completions/max_terminated_length': 15894.0, 'rewards/accuracy_reward/mean': 0.40625, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.40625, 'reward_std': 0.3016803562641144, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019664689898490906, 'sampling/sampling_logp_difference/max': 8.624966621398926, 'sampling/importance_sampling_ratio/min': 0.0001795661955839023, 'sampling/importance_sampling_ratio/mean': 0.9998261332511902, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8542520478367805, 'clip_ratio/low_mean': 4.9131452101391915e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.311648519385926e-06, 'clip_ratio/high_max': 2.5246594077543705e-05, 'clip_ratio/region_mean': 5.544310107552519e-05, 'epoch': 0.59} + + 62%|██████▏ | 639/1024 [29:16:56<18:06:14, 169.28s/it]INFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix 
cache +INFO 12-02 18:41:55 [block_pool.py:292] Successfully reset prefix cache + + 62%|██████▎ | 640/1024 [29:19:39<17:52:00, 167.50s/it] + {'loss': 0.0164, 'grad_norm': 0.0021058651618659496, 'learning_rate': 1e-05, 'num_tokens': 563789214.0, 'completions/mean_length': 6463.2421875, 'completions/min_length': 812.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6305.77001953125, 'completions/min_terminated_length': 812.0, 'completions/max_terminated_length': 15231.0, 'rewards/accuracy_reward/mean': 0.3046875, 'rewards/accuracy_reward/std': 0.46208351850509644, 'reward': 0.3046875, 'reward_std': 0.24541424214839935, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01898353546857834, 'sampling/sampling_logp_difference/max': 7.749993324279785, 'sampling/importance_sampling_ratio/min': 0.00043074542190879583, 'sampling/importance_sampling_ratio/mean': 0.9998518824577332, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8427078947424889, 'clip_ratio/low_mean': 4.154238490627904e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.316983106240514e-06, 'clip_ratio/high_max': 1.2127683930884814e-05, 'clip_ratio/region_mean': 4.685936778514588e-05, 'epoch': 0.59} + + 62%|██████▎ | 640/1024 [29:19:39<17:52:00, 167.50s/it]INFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:44:41 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. 
Gradients will be None + warnings.warn( + + 63%|██████▎ | 641/1024 [29:22:20<17:36:48, 165.56s/it] + {'loss': 0.0816, 'grad_norm': 0.005890186410397291, 'learning_rate': 1e-05, 'num_tokens': 564596185.0, 'completions/mean_length': 6140.7734375, 'completions/min_length': 780.0, 'completions/max_length': 15232.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6140.7734375, 'completions/min_terminated_length': 780.0, 'completions/max_terminated_length': 15232.0, 'rewards/accuracy_reward/mean': 0.4765625, 'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.4765625, 'reward_std': 0.23486016690731049, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01930009014904499, 'sampling/sampling_logp_difference/max': 7.120187759399414, 'sampling/importance_sampling_ratio/min': 0.000808614946436137, 'sampling/importance_sampling_ratio/mean': 0.9998830556869507, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8800382614135742, 'clip_ratio/low_mean': 3.146892504446441e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1398174655805633e-06, 'clip_ratio/high_max': 1.2559269862322253e-05, 'clip_ratio/region_mean': 3.4608742623731814e-05, 'epoch': 0.59} + + 63%|██████▎ | 641/1024 [29:22:20<17:36:48, 165.56s/it]INFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:47:20 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 642/1024 [29:25:07<17:36:17, 165.91s/it] + {'loss': -0.0094, 'grad_norm': 0.003226465079933405, 'learning_rate': 1e-05, 'num_tokens': 565430387.0, 'completions/mean_length': 6361.703125, 'completions/min_length': 510.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6202.61962890625, 'completions/min_terminated_length': 510.0, 
'completions/max_terminated_length': 16246.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.2682726979255676, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019014433026313782, 'sampling/sampling_logp_difference/max': 5.405893802642822, 'sampling/importance_sampling_ratio/min': 0.004490039311349392, 'sampling/importance_sampling_ratio/mean': 0.9999127984046936, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8246701806783676, 'clip_ratio/low_mean': 4.3151162458343606e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2748337212542538e-06, 'clip_ratio/high_max': 5.099334885017015e-06, 'clip_ratio/region_mean': 4.442599617959786e-05, 'epoch': 0.59} + + 63%|██████▎ | 642/1024 [29:25:07<17:36:17, 165.91s/it]INFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:50:07 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 643/1024 [29:28:15<18:15:46, 172.56s/it] + {'loss': 0.0597, 'grad_norm': 0.003077681176364422, 'learning_rate': 1e-05, 'num_tokens': 566393214.0, 'completions/mean_length': 7363.5234375, 'completions/min_length': 706.0, 'completions/max_length': 16283.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 7363.5234375, 'completions/min_terminated_length': 706.0, 'completions/max_terminated_length': 16283.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.24830512702465057, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01871068961918354, 'sampling/sampling_logp_difference/max': 14.924853324890137, 'sampling/importance_sampling_ratio/min': 3.297756165920873e-07, 'sampling/importance_sampling_ratio/mean': 
1.000014066696167, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.828450471162796, 'clip_ratio/low_mean': 3.808748408573592e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.822751001640427e-06, 'clip_ratio/high_max': 2.8547008014356834e-05, 'clip_ratio/region_mean': 4.591023491684609e-05, 'epoch': 0.59} + + 63%|██████▎ | 643/1024 [29:28:15<18:15:46, 172.56s/it]INFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:53:15 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 644/1024 [29:31:06<18:10:46, 172.23s/it] + {'loss': -0.0326, 'grad_norm': 0.0023631115909665823, 'learning_rate': 1e-05, 'num_tokens': 567294697.0, 'completions/mean_length': 6883.8984375, 'completions/min_length': 830.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6809.09423828125, 'completions/min_terminated_length': 830.0, 'completions/max_terminated_length': 16016.0, 'rewards/accuracy_reward/mean': 0.3359375, 'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.22567616403102875, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02030845358967781, 'sampling/sampling_logp_difference/max': 5.291841983795166, 'sampling/importance_sampling_ratio/min': 0.005032482091337442, 'sampling/importance_sampling_ratio/mean': 0.9999625086784363, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9114723727107048, 'clip_ratio/low_mean': 1.9775024611590197e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.2140636727053788e-06, 'clip_ratio/high_max': 4.856254690821515e-06, 'clip_ratio/region_mean': 2.098908817060874e-05, 'epoch': 0.59} + + 63%|██████▎ | 644/1024 [29:31:06<18:10:46, 172.23s/it]INFO 12-02 18:56:06 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-02 18:56:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:56:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:56:06 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 645/1024 [29:33:47<17:44:59, 168.60s/it] + {'loss': 0.071, 'grad_norm': 0.006442595738917589, 'learning_rate': 1e-05, 'num_tokens': 568210240.0, 'completions/mean_length': 6996.9296875, 'completions/min_length': 1477.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6923.015625, 'completions/min_terminated_length': 1477.0, 'completions/max_terminated_length': 16376.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.3061561584472656, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.018702290952205658, 'sampling/sampling_logp_difference/max': 6.779873847961426, 'sampling/importance_sampling_ratio/min': 0.0011364181991666555, 'sampling/importance_sampling_ratio/mean': 0.9999593496322632, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7864109799265862, 'clip_ratio/low_mean': 3.9204465110742603e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.152158688455529e-06, 'clip_ratio/high_max': 4.608634753822116e-06, 'clip_ratio/region_mean': 4.035662391288497e-05, 'epoch': 0.59} + + 63%|██████▎ | 645/1024 [29:33:47<17:44:59, 168.60s/it]INFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 18:58:46 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 646/1024 [29:36:25<17:22:03, 165.40s/it] + {'loss': 0.0495, 'grad_norm': 0.004090449772775173, 'learning_rate': 1e-05, 'num_tokens': 569046727.0, 'completions/mean_length': 6384.5546875, 
'completions/min_length': 878.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6305.81884765625, 'completions/min_terminated_length': 878.0, 'completions/max_terminated_length': 16367.0, 'rewards/accuracy_reward/mean': 0.546875, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.546875, 'reward_std': 0.3266732692718506, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.017125204205513, 'sampling/sampling_logp_difference/max': 7.8639349937438965, 'sampling/importance_sampling_ratio/min': 0.00038435845635831356, 'sampling/importance_sampling_ratio/mean': 0.9999207854270935, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7353173196315765, 'clip_ratio/low_mean': 5.24772226526693e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.011521352571435e-06, 'clip_ratio/high_max': 1.442532902728999e-05, 'clip_ratio/region_mean': 5.748874355049338e-05, 'epoch': 0.59} + + 63%|██████▎ | 646/1024 [29:36:25<17:22:03, 165.40s/it]INFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:01:24 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 647/1024 [29:39:09<17:17:17, 165.09s/it] + {'loss': 0.0076, 'grad_norm': 0.0030447279568761587, 'learning_rate': 1e-05, 'num_tokens': 569975323.0, 'completions/mean_length': 7074.59375, 'completions/min_length': 623.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6696.29248046875, 'completions/min_terminated_length': 623.0, 'completions/max_terminated_length': 16258.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.17176413536071777, 'frac_reward_zero_std': 0.5625, 
'sampling/sampling_logp_difference/mean': 0.019303584471344948, 'sampling/sampling_logp_difference/max': 3.709077835083008, 'sampling/importance_sampling_ratio/min': 0.024500105530023575, 'sampling/importance_sampling_ratio/mean': 0.9999834299087524, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9198992624878883, 'clip_ratio/low_mean': 3.2856025427463464e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.0019189125596313e-06, 'clip_ratio/high_max': 1.2007675650238525e-05, 'clip_ratio/region_mean': 3.585794411264942e-05, 'epoch': 0.6} + + 63%|██████▎ | 647/1024 [29:39:09<17:17:17, 165.09s/it]INFO 12-02 19:04:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:04:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:04:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:04:09 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 648/1024 [29:42:33<18:27:57, 176.80s/it] + {'loss': 0.0678, 'grad_norm': 0.004508152138441801, 'learning_rate': 1e-05, 'num_tokens': 571024900.0, 'completions/mean_length': 8044.2578125, 'completions/min_length': 902.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 7181.52587890625, 'completions/min_terminated_length': 902.0, 'completions/max_terminated_length': 16211.0, 'rewards/accuracy_reward/mean': 0.3203125, 'rewards/accuracy_reward/std': 0.4684300124645233, 'reward': 0.3203125, 'reward_std': 0.26698729395866394, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018804769963026047, 'sampling/sampling_logp_difference/max': 10.130229949951172, 'sampling/importance_sampling_ratio/min': 3.98563061025925e-05, 'sampling/importance_sampling_ratio/mean': 0.9999692440032959, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8030193895101547, 'clip_ratio/low_mean': 7.121561156964162e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5884191952864057e-06, 
'clip_ratio/high_max': 6.353676781145623e-06, 'clip_ratio/region_mean': 7.280403042386752e-05, 'epoch': 0.6} + + 63%|██████▎ | 648/1024 [29:42:33<18:27:57, 176.80s/it]INFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:07:33 [block_pool.py:292] Successfully reset prefix cache + + 63%|██████▎ | 649/1024 [29:45:37<18:39:08, 179.06s/it] + {'loss': 0.0265, 'grad_norm': 0.003926917444914579, 'learning_rate': 1e-05, 'num_tokens': 572125141.0, 'completions/mean_length': 8451.7578125, 'completions/min_length': 813.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7922.94189453125, 'completions/min_terminated_length': 813.0, 'completions/max_terminated_length': 15903.0, 'rewards/accuracy_reward/mean': 0.203125, 'rewards/accuracy_reward/std': 0.40390563011169434, 'reward': 0.203125, 'reward_std': 0.19226360321044922, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.021555956453084946, 'sampling/sampling_logp_difference/max': 16.238862991333008, 'sampling/importance_sampling_ratio/min': 8.862401301712453e-08, 'sampling/importance_sampling_ratio/mean': 0.9999009370803833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.008152723312378, 'clip_ratio/low_mean': 3.612134810282441e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.7038794339896413e-06, 'clip_ratio/high_max': 6.815517735958565e-06, 'clip_ratio/region_mean': 3.7825227536814054e-05, 'epoch': 0.6} + + 63%|██████▎ | 649/1024 [29:45:37<18:39:08, 179.06s/it]INFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:10:37 [block_pool.py:292] Successfully reset prefix 
cache + + 63%|██████▎ | 650/1024 [29:48:47<18:55:56, 182.24s/it] + {'loss': 0.0367, 'grad_norm': 0.0036475847009569407, 'learning_rate': 1e-05, 'num_tokens': 573041934.0, 'completions/mean_length': 7011.8203125, 'completions/min_length': 728.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 6786.88818359375, 'completions/min_terminated_length': 728.0, 'completions/max_terminated_length': 16120.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.31010788679122925, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.02046291157603264, 'sampling/sampling_logp_difference/max': 10.249990463256836, 'sampling/importance_sampling_ratio/min': 3.535783980623819e-05, 'sampling/importance_sampling_ratio/mean': 0.9999783039093018, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8761812150478363, 'clip_ratio/low_mean': 5.86272076361638e-05, 'clip_ratio/low_min': 1.1987166999460896e-05, 'clip_ratio/high_mean': 3.796089742991171e-06, 'clip_ratio/high_max': 1.5184358971964684e-05, 'clip_ratio/region_mean': 6.242329754968523e-05, 'epoch': 0.6} + + 63%|██████▎ | 650/1024 [29:48:47<18:55:56, 182.24s/it]INFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:13:47 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▎ | 651/1024 [29:51:40<18:36:20, 179.57s/it] + {'loss': 0.0678, 'grad_norm': 0.0038963130209594965, 'learning_rate': 1e-05, 'num_tokens': 574040917.0, 'completions/mean_length': 7665.7421875, 'completions/min_length': 816.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7161.3798828125, 'completions/min_terminated_length': 816.0, 
'completions/max_terminated_length': 15510.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3169426918029785, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01943662390112877, 'sampling/sampling_logp_difference/max': 13.374890327453613, 'sampling/importance_sampling_ratio/min': 1.5536705859631184e-06, 'sampling/importance_sampling_ratio/mean': 0.9999545812606812, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7933268994092941, 'clip_ratio/low_mean': 4.855269958170538e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.744779692420707e-06, 'clip_ratio/high_max': 1.0979118769682827e-05, 'clip_ratio/region_mean': 5.129747910359583e-05, 'epoch': 0.6} + + 64%|██████▎ | 651/1024 [29:51:40<18:36:20, 179.57s/it]INFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:16:40 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▎ | 652/1024 [29:54:45<18:42:07, 180.99s/it] + {'loss': 0.0418, 'grad_norm': 0.0030520735308527946, 'learning_rate': 1e-05, 'num_tokens': 575078695.0, 'completions/mean_length': 7966.828125, 'completions/min_length': 553.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7695.30615234375, 'completions/min_terminated_length': 553.0, 'completions/max_terminated_length': 16049.0, 'rewards/accuracy_reward/mean': 0.2734375, 'rewards/accuracy_reward/std': 0.447474867105484, 'reward': 0.2734375, 'reward_std': 0.19332443177700043, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0197810810059309, 'sampling/sampling_logp_difference/max': 7.872013568878174, 'sampling/importance_sampling_ratio/min': 0.00038126588333398104, 
'sampling/importance_sampling_ratio/mean': 1.0000214576721191, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8473240435123444, 'clip_ratio/low_mean': 2.4625115656817798e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.893257598974742e-06, 'clip_ratio/high_max': 9.610412234906107e-06, 'clip_ratio/region_mean': 2.8518373483166215e-05, 'epoch': 0.6} + + 64%|██████▎ | 652/1024 [29:54:45<18:42:07, 180.99s/it]INFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:19:44 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 653/1024 [29:57:06<17:26:01, 169.17s/it] + {'loss': 0.1305, 'grad_norm': 0.0029330148827284575, 'learning_rate': 1e-05, 'num_tokens': 575915163.0, 'completions/mean_length': 6384.53125, 'completions/min_length': 1045.0, 'completions/max_length': 15116.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6384.53125, 'completions/min_terminated_length': 1045.0, 'completions/max_terminated_length': 15116.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2885475754737854, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019899431616067886, 'sampling/sampling_logp_difference/max': 8.872506141662598, 'sampling/importance_sampling_ratio/min': 0.0001401908230036497, 'sampling/importance_sampling_ratio/mean': 0.9999364614486694, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9130589440464973, 'clip_ratio/low_mean': 3.762348410418781e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.0246395049762214e-05, 'clip_ratio/high_max': 4.0985580199048854e-05, 'clip_ratio/region_mean': 4.7869878471829e-05, 'epoch': 0.6} + + 64%|██████▍ | 653/1024 [29:57:06<17:26:01, 169.17s/it]INFO 12-02 19:22:06 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:22:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:22:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:22:06 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 654/1024 [29:59:37<16:50:07, 163.80s/it] + {'loss': 0.0328, 'grad_norm': 0.0037648119032382965, 'learning_rate': 1e-05, 'num_tokens': 576895261.0, 'completions/mean_length': 7484.140625, 'completions/min_length': 745.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7414.06298828125, 'completions/min_terminated_length': 745.0, 'completions/max_terminated_length': 14716.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2987973093986511, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.020455794408917427, 'sampling/sampling_logp_difference/max': 8.220190048217773, 'sampling/importance_sampling_ratio/min': 0.0002691639238037169, 'sampling/importance_sampling_ratio/mean': 0.9999864101409912, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8762720301747322, 'clip_ratio/low_mean': 4.3348386952857254e-05, 'clip_ratio/low_min': 3.435481630731374e-06, 'clip_ratio/high_mean': 1.2012300203423365e-06, 'clip_ratio/high_max': 4.804920081369346e-06, 'clip_ratio/region_mean': 4.454961697319959e-05, 'epoch': 0.6} + + 64%|██████▍ | 654/1024 [29:59:37<16:50:07, 163.80s/it]INFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:24:37 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 655/1024 [30:02:29<17:00:50, 165.99s/it] + {'loss': 0.0874, 'grad_norm': 0.0022230292670428753, 'learning_rate': 1e-05, 'num_tokens': 577874916.0, 
'completions/mean_length': 7483.8671875, 'completions/min_length': 447.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7196.76611328125, 'completions/min_terminated_length': 447.0, 'completions/max_terminated_length': 15614.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.3322049677371979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019235530868172646, 'sampling/sampling_logp_difference/max': 6.195826530456543, 'sampling/importance_sampling_ratio/min': 0.002037918195128441, 'sampling/importance_sampling_ratio/mean': 1.0000191926956177, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8481424525380135, 'clip_ratio/low_mean': 5.7342298759976984e-05, 'clip_ratio/low_min': 1.5017260921013076e-05, 'clip_ratio/high_mean': 5.822761295348755e-06, 'clip_ratio/high_max': 2.329104518139502e-05, 'clip_ratio/region_mean': 6.316505982795206e-05, 'epoch': 0.6} + + 64%|██████▍ | 655/1024 [30:02:29<17:00:50, 165.99s/it]INFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:27:28 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 656/1024 [30:05:24<17:14:51, 168.73s/it] + {'loss': 0.0249, 'grad_norm': 0.003690029727295041, 'learning_rate': 1e-05, 'num_tokens': 578741608.0, 'completions/mean_length': 6618.34375, 'completions/min_length': 563.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6541.44873046875, 'completions/min_terminated_length': 563.0, 'completions/max_terminated_length': 15621.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 
0.22673210501670837, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019522596150636673, 'sampling/sampling_logp_difference/max': 10.818385124206543, 'sampling/importance_sampling_ratio/min': 2.0027882783324458e-05, 'sampling/importance_sampling_ratio/mean': 0.9998915195465088, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8699518665671349, 'clip_ratio/low_mean': 3.113216860128887e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.0502737925198744e-06, 'clip_ratio/high_max': 8.201095170079498e-06, 'clip_ratio/region_mean': 3.318244205274823e-05, 'epoch': 0.6} + + 64%|██████▍ | 656/1024 [30:05:24<17:14:51, 168.73s/it]INFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:30:23 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 657/1024 [30:08:21<17:28:29, 171.41s/it] + {'loss': 0.0846, 'grad_norm': 0.004026883281767368, 'learning_rate': 1e-05, 'num_tokens': 579617377.0, 'completions/mean_length': 6699.6953125, 'completions/min_length': 693.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6223.41748046875, 'completions/min_terminated_length': 693.0, 'completions/max_terminated_length': 16165.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.3056321144104004, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018191896378993988, 'sampling/sampling_logp_difference/max': 14.687499046325684, 'sampling/importance_sampling_ratio/min': 4.181192991836724e-07, 'sampling/importance_sampling_ratio/mean': 0.9997950792312622, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7825306504964828, 'clip_ratio/low_mean': 5.6235591728182044e-05, 'clip_ratio/low_min': 
0.0, 'clip_ratio/high_mean': 1.0405914281363948e-06, 'clip_ratio/high_max': 4.162365712545579e-06, 'clip_ratio/region_mean': 5.7276183270005276e-05, 'epoch': 0.6} + + 64%|██████▍ | 657/1024 [30:08:21<17:28:29, 171.41s/it]INFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:33:21 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 658/1024 [30:10:41<16:27:35, 161.90s/it] + {'loss': 0.0796, 'grad_norm': 0.004194674547761679, 'learning_rate': 1e-05, 'num_tokens': 580402633.0, 'completions/mean_length': 5984.875, 'completions/min_length': 1404.0, 'completions/max_length': 15406.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5984.875, 'completions/min_terminated_length': 1404.0, 'completions/max_terminated_length': 15406.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.3066929578781128, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.019084136933088303, 'sampling/sampling_logp_difference/max': 5.749660015106201, 'sampling/importance_sampling_ratio/min': 0.003183862892910838, 'sampling/importance_sampling_ratio/mean': 0.9999486804008484, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8239431977272034, 'clip_ratio/low_mean': 3.858270270029607e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.379652520787204e-06, 'clip_ratio/high_max': 2.1518610083148815e-05, 'clip_ratio/region_mean': 4.396235544845695e-05, 'epoch': 0.61} + + 64%|██████▍ | 658/1024 [30:10:41<16:27:35, 161.90s/it]INFO 12-02 19:35:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:35:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:35:41 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:35:41 
[block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 659/1024 [30:13:17<16:14:40, 160.22s/it] + {'loss': -0.001, 'grad_norm': 0.005016419570893049, 'learning_rate': 1e-05, 'num_tokens': 581187586.0, 'completions/mean_length': 5950.5703125, 'completions/min_length': 1140.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5784.96044921875, 'completions/min_terminated_length': 1140.0, 'completions/max_terminated_length': 15690.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.2306838035583496, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01908070594072342, 'sampling/sampling_logp_difference/max': 5.809609413146973, 'sampling/importance_sampling_ratio/min': 0.002998600946739316, 'sampling/importance_sampling_ratio/mean': 0.9999349117279053, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8884857445955276, 'clip_ratio/low_mean': 5.0344978262728546e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.680707826944854e-06, 'clip_ratio/high_max': 2.6722831307779416e-05, 'clip_ratio/region_mean': 5.702568614651682e-05, 'epoch': 0.61} + + 64%|██████▍ | 659/1024 [30:13:17<16:14:40, 160.22s/it]INFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:38:17 [block_pool.py:292] Successfully reset prefix cache + + 64%|██████▍ | 660/1024 [30:16:13<16:39:42, 164.79s/it] + {'loss': 0.0718, 'grad_norm': 0.0035609283950179815, 'learning_rate': 1e-05, 'num_tokens': 582236557.0, 'completions/mean_length': 8029.7734375, 'completions/min_length': 1584.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 7690.17041015625, 
'completions/min_terminated_length': 1584.0, 'completions/max_terminated_length': 16101.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2409384548664093, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01982714608311653, 'sampling/sampling_logp_difference/max': 5.2554192543029785, 'sampling/importance_sampling_ratio/min': 0.005219157785177231, 'sampling/importance_sampling_ratio/mean': 0.999931275844574, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.858074463903904, 'clip_ratio/low_mean': 2.6390790822006238e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.1695884697692236e-06, 'clip_ratio/high_max': 8.678353879076894e-06, 'clip_ratio/region_mean': 2.8560379291775462e-05, 'epoch': 0.61} + + 64%|██████▍ | 660/1024 [30:16:13<16:39:42, 164.79s/it]INFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:41:12 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▍ | 661/1024 [30:19:04<16:48:57, 166.77s/it] + {'loss': 0.0505, 'grad_norm': 0.005020176526159048, 'learning_rate': 1e-05, 'num_tokens': 583150740.0, 'completions/mean_length': 6958.4921875, 'completions/min_length': 904.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 6494.9423828125, 'completions/min_terminated_length': 904.0, 'completions/max_terminated_length': 16063.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2109457552433014, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018450919538736343, 'sampling/sampling_logp_difference/max': 3.8077571392059326, 'sampling/importance_sampling_ratio/min': 
0.022197909653186798, 'sampling/importance_sampling_ratio/mean': 0.999988853931427, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7957572638988495, 'clip_ratio/low_mean': 3.278200858858327e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.522766622969357e-06, 'clip_ratio/high_max': 2.362454961257754e-05, 'clip_ratio/region_mean': 4.030477487049211e-05, 'epoch': 0.61} + + 65%|██████▍ | 661/1024 [30:19:04<16:48:57, 166.77s/it]INFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:44:04 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▍ | 662/1024 [30:21:38<16:23:04, 162.94s/it] + {'loss': 0.0826, 'grad_norm': 0.005002971272915602, 'learning_rate': 1e-05, 'num_tokens': 584044250.0, 'completions/mean_length': 6810.234375, 'completions/min_length': 1105.0, 'completions/max_length': 15856.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 6810.234375, 'completions/min_terminated_length': 1105.0, 'completions/max_terminated_length': 15856.0, 'rewards/accuracy_reward/mean': 0.5390625, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.5390625, 'reward_std': 0.22225630283355713, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018436448648571968, 'sampling/sampling_logp_difference/max': 10.743270874023438, 'sampling/importance_sampling_ratio/min': 2.1590203687082976e-05, 'sampling/importance_sampling_ratio/mean': 0.9999277591705322, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7868659943342209, 'clip_ratio/low_mean': 4.201903630018933e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.383796982030617e-06, 'clip_ratio/high_max': 9.535187928122468e-06, 'clip_ratio/region_mean': 4.440283305484627e-05, 'epoch': 0.61} + + 65%|██████▍ | 662/1024 [30:21:38<16:23:04, 162.94s/it]INFO 12-02 
19:46:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:46:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:46:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:46:38 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▍ | 663/1024 [30:24:36<16:46:14, 167.24s/it] + {'loss': 0.1195, 'grad_norm': 0.0021831525955349207, 'learning_rate': 1e-05, 'num_tokens': 584971568.0, 'completions/mean_length': 7106.296875, 'completions/min_length': 802.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6487.78369140625, 'completions/min_terminated_length': 802.0, 'completions/max_terminated_length': 16291.0, 'rewards/accuracy_reward/mean': 0.5625, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.5625, 'reward_std': 0.32772916555404663, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.018681492656469345, 'sampling/sampling_logp_difference/max': 9.413989067077637, 'sampling/importance_sampling_ratio/min': 8.157488628057763e-05, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8079892098903656, 'clip_ratio/low_mean': 5.7681085309013724e-05, 'clip_ratio/low_min': 4.5418209992931224e-06, 'clip_ratio/high_mean': 9.566726021148497e-06, 'clip_ratio/high_max': 3.5268151805212256e-05, 'clip_ratio/region_mean': 6.724781314915163e-05, 'epoch': 0.61} + + 65%|██████▍ | 663/1024 [30:24:36<16:46:14, 167.24s/it]INFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:49:35 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▍ | 664/1024 [30:27:20<16:37:47, 166.30s/it] + {'loss': 0.0642, 'grad_norm': 0.004661369137465954, 'learning_rate': 1e-05, 'num_tokens': 
585916134.0, 'completions/mean_length': 7235.046875, 'completions/min_length': 1472.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7089.82568359375, 'completions/min_terminated_length': 1472.0, 'completions/max_terminated_length': 16363.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.322716623544693, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018921509385108948, 'sampling/sampling_logp_difference/max': 7.249154567718506, 'sampling/importance_sampling_ratio/min': 0.0007107750861905515, 'sampling/importance_sampling_ratio/mean': 1.0000330209732056, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8041050210595131, 'clip_ratio/low_mean': 3.626145735324826e-05, 'clip_ratio/low_min': 3.933786501875147e-06, 'clip_ratio/high_mean': 1.1574332802410936e-05, 'clip_ratio/high_max': 4.332071557655581e-05, 'clip_ratio/region_mean': 4.783579004197236e-05, 'epoch': 0.61} + + 65%|██████▍ | 664/1024 [30:27:20<16:37:47, 166.30s/it]INFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:52:19 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▍ | 665/1024 [30:30:06<16:35:59, 166.46s/it] + {'loss': 0.0538, 'grad_norm': 0.0024479639250785112, 'learning_rate': 1e-05, 'num_tokens': 586841633.0, 'completions/mean_length': 7077.5859375, 'completions/min_length': 944.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6777.37890625, 'completions/min_terminated_length': 944.0, 'completions/max_terminated_length': 13888.0, 'rewards/accuracy_reward/mean': 0.5, 'rewards/accuracy_reward/std': 0.5019646286964417, 'reward': 0.5, 'reward_std': 
0.2001592218875885, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.019227145239710808, 'sampling/sampling_logp_difference/max': 8.160323143005371, 'sampling/importance_sampling_ratio/min': 0.00028577001648955047, 'sampling/importance_sampling_ratio/mean': 0.9998941421508789, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8417644873261452, 'clip_ratio/low_mean': 2.6745638365355262e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.737838710549113e-06, 'clip_ratio/high_max': 1.4951354842196452e-05, 'clip_ratio/region_mean': 3.0483477416964888e-05, 'epoch': 0.61} + + 65%|██████▍ | 665/1024 [30:30:06<16:35:59, 166.46s/it]INFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:55:06 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▌ | 666/1024 [30:33:15<17:11:59, 172.96s/it] + {'loss': 0.0352, 'grad_norm': 0.005297356750816107, 'learning_rate': 1e-05, 'num_tokens': 587897122.0, 'completions/mean_length': 8090.3203125, 'completions/min_length': 768.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 7463.0673828125, 'completions/min_terminated_length': 768.0, 'completions/max_terminated_length': 15900.0, 'rewards/accuracy_reward/mean': 0.2421875, 'rewards/accuracy_reward/std': 0.4300905168056488, 'reward': 0.2421875, 'reward_std': 0.27851754426956177, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018079372122883797, 'sampling/sampling_logp_difference/max': 7.353616237640381, 'sampling/importance_sampling_ratio/min': 0.0006402728031389415, 'sampling/importance_sampling_ratio/mean': 0.9999694228172302, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7603196427226067, 'clip_ratio/low_mean': 4.123253006582672e-05, 
'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.19675950272358e-06, 'clip_ratio/high_max': 1.7368187855026918e-05, 'clip_ratio/region_mean': 4.642928979592398e-05, 'epoch': 0.61} + + 65%|██████▌ | 666/1024 [30:33:15<17:11:59, 172.96s/it]INFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 19:58:14 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▌ | 667/1024 [30:35:55<16:46:21, 169.14s/it] + {'loss': 0.0695, 'grad_norm': 0.003049109596759081, 'learning_rate': 1e-05, 'num_tokens': 588801206.0, 'completions/mean_length': 6908.96875, 'completions/min_length': 406.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6360.826171875, 'completions/min_terminated_length': 406.0, 'completions/max_terminated_length': 15514.0, 'rewards/accuracy_reward/mean': 0.515625, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.25460314750671387, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018563130870461464, 'sampling/sampling_logp_difference/max': 5.573733329772949, 'sampling/importance_sampling_ratio/min': 0.0037962812930345535, 'sampling/importance_sampling_ratio/mean': 0.9999892711639404, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7355617135763168, 'clip_ratio/low_mean': 2.9263440183058265e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.941788918382372e-06, 'clip_ratio/high_max': 1.5767155673529487e-05, 'clip_ratio/region_mean': 3.3205229101440636e-05, 'epoch': 0.61} + + 65%|██████▌ | 667/1024 [30:35:55<16:46:21, 169.14s/it]INFO 12-02 20:00:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:00:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:00:54 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-02 20:00:54 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▌ | 668/1024 [30:38:49<16:52:25, 170.63s/it] + {'loss': 0.1521, 'grad_norm': 0.0034495368599891663, 'learning_rate': 1e-05, 'num_tokens': 589732588.0, 'completions/mean_length': 7110.109375, 'completions/min_length': 1008.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 6810.951171875, 'completions/min_terminated_length': 1008.0, 'completions/max_terminated_length': 16333.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.326668381690979, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.016679491847753525, 'sampling/sampling_logp_difference/max': 7.4639434814453125, 'sampling/importance_sampling_ratio/min': 0.000573390512727201, 'sampling/importance_sampling_ratio/mean': 0.9999086856842041, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.688617967069149, 'clip_ratio/low_mean': 6.839358093202463e-05, 'clip_ratio/low_min': 9.10438984647044e-06, 'clip_ratio/high_mean': 4.312999067224155e-06, 'clip_ratio/high_max': 1.725199626889662e-05, 'clip_ratio/region_mean': 7.27065794308146e-05, 'epoch': 0.61} + + 65%|██████▌ | 668/1024 [30:38:49<16:52:25, 170.63s/it]INFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:03:49 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▌ | 669/1024 [30:41:44<16:56:45, 171.85s/it] + {'loss': 0.0258, 'grad_norm': 0.004971730522811413, 'learning_rate': 1e-05, 'num_tokens': 590717118.0, 'completions/mean_length': 7533.578125, 'completions/min_length': 1321.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 
7021.56982421875, 'completions/min_terminated_length': 1321.0, 'completions/max_terminated_length': 16263.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.30904704332351685, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01741175726056099, 'sampling/sampling_logp_difference/max': 9.96833324432373, 'sampling/importance_sampling_ratio/min': 4.6860604925313964e-05, 'sampling/importance_sampling_ratio/mean': 0.9998904466629028, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7306379675865173, 'clip_ratio/low_mean': 5.138145911587344e-05, 'clip_ratio/low_min': 3.9801311686460394e-06, 'clip_ratio/high_mean': 2.31802277994575e-06, 'clip_ratio/high_max': 5.049688752478687e-06, 'clip_ratio/region_mean': 5.369948189581919e-05, 'epoch': 0.62} + + 65%|██████▌ | 669/1024 [30:41:44<16:56:45, 171.85s/it]INFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:06:43 [block_pool.py:292] Successfully reset prefix cache + + 65%|██████▌ | 670/1024 [30:44:18<16:23:01, 166.61s/it] + {'loss': 0.0893, 'grad_norm': 0.003072877414524555, 'learning_rate': 1e-05, 'num_tokens': 591524494.0, 'completions/mean_length': 6165.0, 'completions/min_length': 1088.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6002.7939453125, 'completions/min_terminated_length': 1088.0, 'completions/max_terminated_length': 15983.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.28353992104530334, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016893092542886734, 'sampling/sampling_logp_difference/max': 8.5533447265625, 
'sampling/importance_sampling_ratio/min': 0.00019289882038719952, 'sampling/importance_sampling_ratio/mean': 1.0000028610229492, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7227498516440392, 'clip_ratio/low_mean': 4.160707453593204e-05, 'clip_ratio/low_min': 7.402582014037762e-06, 'clip_ratio/high_mean': 3.4612473314155068e-06, 'clip_ratio/high_max': 1.3844989325662027e-05, 'clip_ratio/region_mean': 4.506832192419097e-05, 'epoch': 0.62} + + 65%|██████▌ | 670/1024 [30:44:18<16:23:01, 166.61s/it]INFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:09:18 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 671/1024 [30:46:54<16:01:07, 163.36s/it] + {'loss': 0.0619, 'grad_norm': 0.003992745652794838, 'learning_rate': 1e-05, 'num_tokens': 592320726.0, 'completions/mean_length': 6061.9375, 'completions/min_length': 973.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 5728.9677734375, 'completions/min_terminated_length': 973.0, 'completions/max_terminated_length': 15451.0, 'rewards/accuracy_reward/mean': 0.578125, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.578125, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018718186765909195, 'sampling/sampling_logp_difference/max': 9.499366760253906, 'sampling/importance_sampling_ratio/min': 7.489924610126764e-05, 'sampling/importance_sampling_ratio/mean': 0.9999755620956421, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.813653938472271, 'clip_ratio/low_mean': 3.8767432329223084e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.71779502631398e-06, 'clip_ratio/high_max': 3.056439982174197e-05, 'clip_ratio/region_mean': 4.7485227241850225e-05, 'epoch': 0.62} + + 
66%|██████▌ | 671/1024 [30:46:54<16:01:07, 163.36s/it]INFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:11:53 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 672/1024 [30:50:04<16:46:26, 171.55s/it] + {'loss': 0.1016, 'grad_norm': 0.003727070288732648, 'learning_rate': 1e-05, 'num_tokens': 593270695.0, 'completions/mean_length': 7265.9453125, 'completions/min_length': 913.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6658.0751953125, 'completions/min_terminated_length': 913.0, 'completions/max_terminated_length': 16210.0, 'rewards/accuracy_reward/mean': 0.4921875, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.4921875, 'reward_std': 0.30327796936035156, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.017805757001042366, 'sampling/sampling_logp_difference/max': 13.115309715270996, 'sampling/importance_sampling_ratio/min': 2.014157189478283e-06, 'sampling/importance_sampling_ratio/mean': 0.999910831451416, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7658502459526062, 'clip_ratio/low_mean': 3.851054543702048e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.138349368076888e-06, 'clip_ratio/high_max': 1.655339747230755e-05, 'clip_ratio/region_mean': 4.264889435035002e-05, 'epoch': 0.62} + + 66%|██████▌ | 672/1024 [30:50:04<16:46:26, 171.55s/it]INFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:15:04 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 673/1024 [30:53:14<17:15:09, 176.95s/it] + {'loss': 0.0535, 'grad_norm': 
0.0038730741944164038, 'learning_rate': 1e-05, 'num_tokens': 594386261.0, 'completions/mean_length': 8564.046875, 'completions/min_length': 968.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 7680.0517578125, 'completions/min_terminated_length': 968.0, 'completions/max_terminated_length': 16355.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.32483339309692383, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01785116083920002, 'sampling/sampling_logp_difference/max': 8.660311698913574, 'sampling/importance_sampling_ratio/min': 0.00017333027790300548, 'sampling/importance_sampling_ratio/mean': 0.9999313354492188, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6856872886419296, 'clip_ratio/low_mean': 5.263989112336276e-05, 'clip_ratio/low_min': 1.2888257515442092e-05, 'clip_ratio/high_mean': 6.335726652650919e-06, 'clip_ratio/high_max': 2.0501698145380942e-05, 'clip_ratio/region_mean': 5.897561732126633e-05, 'epoch': 0.62} + + 66%|██████▌ | 673/1024 [30:53:14<17:15:09, 176.95s/it]INFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:18:14 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 674/1024 [30:56:08<17:07:17, 176.11s/it] + {'loss': 0.0565, 'grad_norm': 0.004014961421489716, 'learning_rate': 1e-05, 'num_tokens': 595407313.0, 'completions/mean_length': 7838.28125, 'completions/min_length': 872.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7343.900390625, 'completions/min_terminated_length': 872.0, 'completions/max_terminated_length': 16349.0, 'rewards/accuracy_reward/mean': 0.46875, 
'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.3148210048675537, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01619477942585945, 'sampling/sampling_logp_difference/max': 13.904884338378906, 'sampling/importance_sampling_ratio/min': 9.145037438429426e-07, 'sampling/importance_sampling_ratio/mean': 0.999966025352478, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.636501632630825, 'clip_ratio/low_mean': 4.970566510564822e-05, 'clip_ratio/low_min': 4.473552507988643e-06, 'clip_ratio/high_mean': 8.523603241883393e-06, 'clip_ratio/high_max': 2.6982705094269477e-05, 'clip_ratio/region_mean': 5.82292680064711e-05, 'epoch': 0.62} + + 66%|██████▌ | 674/1024 [30:56:08<17:07:17, 176.11s/it]INFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:21:08 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 675/1024 [30:59:05<17:05:21, 176.28s/it] + {'loss': 0.0855, 'grad_norm': 0.004226911347359419, 'learning_rate': 1e-05, 'num_tokens': 596291470.0, 'completions/mean_length': 6784.5390625, 'completions/min_length': 906.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 6229.1982421875, 'completions/min_terminated_length': 906.0, 'completions/max_terminated_length': 16323.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.2409384697675705, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.015974994748830795, 'sampling/sampling_logp_difference/max': 8.49952507019043, 'sampling/importance_sampling_ratio/min': 0.00020356501045171171, 'sampling/importance_sampling_ratio/mean': 0.9999697208404541, 'sampling/importance_sampling_ratio/max': 2.0, 
'entropy': 0.6435417085886002, 'clip_ratio/low_mean': 2.8467071842896985e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.4123655773801147e-06, 'clip_ratio/high_max': 5.649462309520459e-06, 'clip_ratio/region_mean': 2.98794374202771e-05, 'epoch': 0.62} + + 66%|██████▌ | 675/1024 [30:59:05<17:05:21, 176.28s/it]INFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:24:04 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 676/1024 [31:02:07<17:13:25, 178.18s/it] + {'loss': 0.0675, 'grad_norm': 0.0031262668780982494, 'learning_rate': 1e-05, 'num_tokens': 597291107.0, 'completions/mean_length': 7650.2265625, 'completions/min_length': 1063.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6989.689453125, 'completions/min_terminated_length': 1063.0, 'completions/max_terminated_length': 16122.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2012200653553009, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.01801086962223053, 'sampling/sampling_logp_difference/max': 14.7490234375, 'sampling/importance_sampling_ratio/min': 3.9317012578976573e-07, 'sampling/importance_sampling_ratio/mean': 0.9998708963394165, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7500722259283066, 'clip_ratio/low_mean': 2.2315146964047017e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 2.2315146964047017e-05, 'epoch': 0.62} + + 66%|██████▌ | 676/1024 [31:02:07<17:13:25, 178.18s/it]INFO 12-02 20:27:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:27:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:27:07 
[block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:27:07 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 677/1024 [31:05:01<17:03:20, 176.95s/it] + {'loss': 0.0759, 'grad_norm': 0.006345350295305252, 'learning_rate': 1e-05, 'num_tokens': 598129568.0, 'completions/mean_length': 6377.4140625, 'completions/min_length': 478.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6298.6220703125, 'completions/min_terminated_length': 478.0, 'completions/max_terminated_length': 15718.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.31929677724838257, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01888679713010788, 'sampling/sampling_logp_difference/max': 9.620697021484375, 'sampling/importance_sampling_ratio/min': 6.634136661887169e-05, 'sampling/importance_sampling_ratio/mean': 1.0000131130218506, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8221950903534889, 'clip_ratio/low_mean': 5.510050823431811e-05, 'clip_ratio/low_min': 4.993807579012355e-06, 'clip_ratio/high_mean': 5.693989294286439e-06, 'clip_ratio/high_max': 2.2775957177145756e-05, 'clip_ratio/region_mean': 6.079449713070062e-05, 'epoch': 0.62} + + 66%|██████▌ | 677/1024 [31:05:01<17:03:20, 176.95s/it]INFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:30:01 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▌ | 678/1024 [31:07:45<16:36:23, 172.78s/it] + {'loss': 0.0811, 'grad_norm': 0.003562809666618705, 'learning_rate': 1e-05, 'num_tokens': 598862361.0, 'completions/mean_length': 5567.2578125, 'completions/min_length': 927.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 
'completions/mean_terminated_length': 5218.33056640625, 'completions/min_terminated_length': 927.0, 'completions/max_terminated_length': 14250.0, 'rewards/accuracy_reward/mean': 0.5703125, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.5703125, 'reward_std': 0.2698703408241272, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.016367387026548386, 'sampling/sampling_logp_difference/max': 15.2844877243042, 'sampling/importance_sampling_ratio/min': 2.3016077932425105e-07, 'sampling/importance_sampling_ratio/mean': 0.9999499320983887, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7284790053963661, 'clip_ratio/low_mean': 5.5144641464721644e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.139227250263502e-06, 'clip_ratio/high_max': 2.3920926196296932e-05, 'clip_ratio/region_mean': 6.228386882867198e-05, 'epoch': 0.62} + + 66%|██████▌ | 678/1024 [31:07:45<16:36:23, 172.78s/it]INFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:32:44 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▋ | 679/1024 [31:10:39<16:35:49, 173.19s/it] + {'loss': 0.0517, 'grad_norm': 0.0031477995216846466, 'learning_rate': 1e-05, 'num_tokens': 599921233.0, 'completions/mean_length': 8128.375, 'completions/min_length': 1066.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7578.00048828125, 'completions/min_terminated_length': 1066.0, 'completions/max_terminated_length': 15202.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.21542152762413025, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.019213391467928886, 'sampling/sampling_logp_difference/max': 8.569145202636719, 
'sampling/importance_sampling_ratio/min': 0.00018987487419508398, 'sampling/importance_sampling_ratio/mean': 0.9999460577964783, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7838430106639862, 'clip_ratio/low_mean': 2.498499657122011e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.364888013697055e-06, 'clip_ratio/high_max': 1.4490571629721671e-05, 'clip_ratio/region_mean': 2.934988481229084e-05, 'epoch': 0.62} + + 66%|██████▋ | 679/1024 [31:10:39<16:35:49, 173.19s/it]INFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:35:38 [block_pool.py:292] Successfully reset prefix cache + + 66%|██████▋ | 680/1024 [31:13:29<16:27:32, 172.25s/it] + {'loss': 0.0896, 'grad_norm': 0.0034168637357652187, 'learning_rate': 1e-05, 'num_tokens': 600895023.0, 'completions/mean_length': 7452.296875, 'completions/min_length': 799.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 7013.0322265625, 'completions/min_terminated_length': 799.0, 'completions/max_terminated_length': 15879.0, 'rewards/accuracy_reward/mean': 0.296875, 'rewards/accuracy_reward/std': 0.45867621898651123, 'reward': 0.296875, 'reward_std': 0.3061561584472656, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019955754280090332, 'sampling/sampling_logp_difference/max': 7.843585014343262, 'sampling/importance_sampling_ratio/min': 0.0003922602627426386, 'sampling/importance_sampling_ratio/mean': 0.9999901056289673, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8657966181635857, 'clip_ratio/low_mean': 3.322141196804296e-05, 'clip_ratio/low_min': 2.5509161787340418e-06, 'clip_ratio/high_mean': 8.023214263630507e-06, 'clip_ratio/high_max': 2.650051692398847e-05, 'clip_ratio/region_mean': 4.124462532217876e-05, 'epoch': 
0.63} + + 66%|██████▋ | 680/1024 [31:13:29<16:27:32, 172.25s/it]INFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:38:28 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 681/1024 [31:16:16<16:16:19, 170.78s/it] + {'loss': 0.105, 'grad_norm': 0.0026859277859330177, 'learning_rate': 1e-05, 'num_tokens': 601887935.0, 'completions/mean_length': 7581.625, 'completions/min_length': 1686.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0546875, 'completions/mean_terminated_length': 7072.396484375, 'completions/min_terminated_length': 1686.0, 'completions/max_terminated_length': 15759.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.3295465111732483, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018653862178325653, 'sampling/sampling_logp_difference/max': 3.5232622623443604, 'sampling/importance_sampling_ratio/min': 0.029503032565116882, 'sampling/importance_sampling_ratio/mean': 0.9999804496765137, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.764233261346817, 'clip_ratio/low_mean': 5.516502255886735e-05, 'clip_ratio/low_min': 5.772084023192292e-06, 'clip_ratio/high_mean': 2.0586570599334664e-06, 'clip_ratio/high_max': 8.234628239733865e-06, 'clip_ratio/region_mean': 5.7223681096729706e-05, 'epoch': 0.63} + + 67%|██████▋ | 681/1024 [31:16:16<16:16:19, 170.78s/it]INFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:41:16 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 682/1024 [31:19:16<16:29:32, 173.60s/it] + {'loss': 0.0491, 
'grad_norm': 0.002624326851218939, 'learning_rate': 1e-05, 'num_tokens': 603035462.0, 'completions/mean_length': 8824.2421875, 'completions/min_length': 1991.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.046875, 'completions/mean_terminated_length': 8452.4501953125, 'completions/min_terminated_length': 1991.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.328125, 'rewards/accuracy_reward/std': 0.4713755249977112, 'reward': 0.328125, 'reward_std': 0.2688094973564148, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01799936406314373, 'sampling/sampling_logp_difference/max': 9.874824523925781, 'sampling/importance_sampling_ratio/min': 5.1453887863317505e-05, 'sampling/importance_sampling_ratio/mean': 0.9999333024024963, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7557987719774246, 'clip_ratio/low_mean': 5.129833289174712e-05, 'clip_ratio/low_min': 5.234505806583911e-06, 'clip_ratio/high_mean': 6.635149020439712e-06, 'clip_ratio/high_max': 2.654059608175885e-05, 'clip_ratio/region_mean': 5.793348100269213e-05, 'epoch': 0.63} + + 67%|██████▋ | 682/1024 [31:19:16<16:29:32, 173.60s/it]INFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:44:16 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 683/1024 [31:22:02<16:13:28, 171.29s/it] + {'loss': 0.0796, 'grad_norm': 0.005783884786069393, 'learning_rate': 1e-05, 'num_tokens': 603801083.0, 'completions/mean_length': 5832.7890625, 'completions/min_length': 948.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 5749.70849609375, 'completions/min_terminated_length': 948.0, 'completions/max_terminated_length': 16189.0, 'rewards/accuracy_reward/mean': 0.5234375, 
'rewards/accuracy_reward/std': 0.5014128684997559, 'reward': 0.5234375, 'reward_std': 0.27092626690864563, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01805710420012474, 'sampling/sampling_logp_difference/max': 3.399966239929199, 'sampling/importance_sampling_ratio/min': 0.033374395221471786, 'sampling/importance_sampling_ratio/mean': 1.0000687837600708, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8034545630216599, 'clip_ratio/low_mean': 3.1395032920045196e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 3.1395032920045196e-05, 'epoch': 0.63} + + 67%|██████▋ | 683/1024 [31:22:02<16:13:28, 171.29s/it]INFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:47:02 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 684/1024 [31:24:54<16:12:00, 171.53s/it] + {'loss': 0.0832, 'grad_norm': 0.005038067698478699, 'learning_rate': 1e-05, 'num_tokens': 604748150.0, 'completions/mean_length': 7247.4609375, 'completions/min_length': 416.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 7102.43701171875, 'completions/min_terminated_length': 416.0, 'completions/max_terminated_length': 16221.0, 'rewards/accuracy_reward/mean': 0.46875, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.46875, 'reward_std': 0.43106767535209656, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.01983889564871788, 'sampling/sampling_logp_difference/max': 5.781791687011719, 'sampling/importance_sampling_ratio/min': 0.0030831864569336176, 'sampling/importance_sampling_ratio/mean': 0.9999319314956665, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.908146396279335, 'clip_ratio/low_mean': 
5.521000275621191e-05, 'clip_ratio/low_min': 9.064021924132248e-06, 'clip_ratio/high_mean': 6.736250270478195e-06, 'clip_ratio/high_max': 2.2193052700458793e-05, 'clip_ratio/region_mean': 6.19462530266901e-05, 'epoch': 0.63} + + 67%|██████▋ | 684/1024 [31:24:54<16:12:00, 171.53s/it]INFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:49:54 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 685/1024 [31:27:35<15:50:10, 168.17s/it] + {'loss': 0.0967, 'grad_norm': 0.0053928992711007595, 'learning_rate': 1e-05, 'num_tokens': 605642768.0, 'completions/mean_length': 6861.078125, 'completions/min_length': 530.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0390625, 'completions/mean_terminated_length': 6473.96728515625, 'completions/min_terminated_length': 530.0, 'completions/max_terminated_length': 15519.0, 'rewards/accuracy_reward/mean': 0.5078125, 'rewards/accuracy_reward/std': 0.5019033551216125, 'reward': 0.5078125, 'reward_std': 0.40503159165382385, 'frac_reward_zero_std': 0.125, 'sampling/sampling_logp_difference/mean': 0.018197370693087578, 'sampling/sampling_logp_difference/max': 9.99008560180664, 'sampling/importance_sampling_ratio/min': 4.585228089126758e-05, 'sampling/importance_sampling_ratio/mean': 0.9999208450317383, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7612876370549202, 'clip_ratio/low_mean': 6.599987852951017e-05, 'clip_ratio/low_min': 1.7551100199852954e-05, 'clip_ratio/high_mean': 2.157538972369366e-06, 'clip_ratio/high_max': 8.630155889477464e-06, 'clip_ratio/region_mean': 6.815741778609663e-05, 'epoch': 0.63} + + 67%|██████▋ | 685/1024 [31:27:35<15:50:10, 168.17s/it]INFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache 
+INFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:52:34 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 686/1024 [31:30:24<15:50:11, 168.67s/it] + {'loss': 0.1155, 'grad_norm': 0.0032709913793951273, 'learning_rate': 1e-05, 'num_tokens': 606534577.0, 'completions/mean_length': 6837.8203125, 'completions/min_length': 558.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 6201.40869140625, 'completions/min_terminated_length': 558.0, 'completions/max_terminated_length': 16310.0, 'rewards/accuracy_reward/mean': 0.484375, 'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.484375, 'reward_std': 0.2567248046398163, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016331009566783905, 'sampling/sampling_logp_difference/max': 10.062474250793457, 'sampling/importance_sampling_ratio/min': 4.2650382965803146e-05, 'sampling/importance_sampling_ratio/mean': 0.9999561309814453, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6217481270432472, 'clip_ratio/low_mean': 5.132838714416721e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.32969795333338e-06, 'clip_ratio/high_max': 2.531879181333352e-05, 'clip_ratio/region_mean': 5.765808464275324e-05, 'epoch': 0.63} + + 67%|██████▋ | 686/1024 [31:30:24<15:50:11, 168.67s/it]INFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:55:24 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 687/1024 [31:33:08<15:38:48, 167.15s/it] + {'loss': 0.0581, 'grad_norm': 0.002640153281390667, 'learning_rate': 1e-05, 'num_tokens': 607381811.0, 'completions/mean_length': 6458.703125, 'completions/min_length': 454.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 
0.0078125, 'completions/mean_terminated_length': 6380.55126953125, 'completions/min_terminated_length': 454.0, 'completions/max_terminated_length': 15103.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.2382800281047821, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.017693117260932922, 'sampling/sampling_logp_difference/max': 13.93669319152832, 'sampling/importance_sampling_ratio/min': 8.858721116666857e-07, 'sampling/importance_sampling_ratio/mean': 1.0000228881835938, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7460968196392059, 'clip_ratio/low_mean': 5.021198876420385e-05, 'clip_ratio/low_min': 4.219409220240777e-06, 'clip_ratio/high_mean': 3.581897317417315e-06, 'clip_ratio/high_max': 1.0992388070008019e-05, 'clip_ratio/region_mean': 5.379388539950014e-05, 'epoch': 0.63} + + 67%|██████▋ | 687/1024 [31:33:08<15:38:48, 167.15s/it]INFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 20:58:08 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 688/1024 [31:35:53<15:32:14, 166.47s/it] + {'loss': 0.0635, 'grad_norm': 0.003713687416166067, 'learning_rate': 1e-05, 'num_tokens': 608302256.0, 'completions/mean_length': 7043.1640625, 'completions/min_length': 952.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6894.8974609375, 'completions/min_terminated_length': 952.0, 'completions/max_terminated_length': 16121.0, 'rewards/accuracy_reward/mean': 0.390625, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.390625, 'reward_std': 0.2648528814315796, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019254781305789948, 
'sampling/sampling_logp_difference/max': 16.12498664855957, 'sampling/importance_sampling_ratio/min': 9.931326871992496e-08, 'sampling/importance_sampling_ratio/mean': 0.9999112486839294, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7884078621864319, 'clip_ratio/low_mean': 6.0473582834674744e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.1366002417598793e-06, 'clip_ratio/high_max': 1.2546400967039517e-05, 'clip_ratio/region_mean': 6.361018404277274e-05, 'epoch': 0.63} + + 67%|██████▋ | 688/1024 [31:35:53<15:32:14, 166.47s/it]INFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:00:53 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 689/1024 [31:38:48<15:44:30, 169.17s/it] + {'loss': 0.0275, 'grad_norm': 0.004894682671874762, 'learning_rate': 1e-05, 'num_tokens': 609348299.0, 'completions/mean_length': 8012.8359375, 'completions/min_length': 866.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7742.79833984375, 'completions/min_terminated_length': 866.0, 'completions/max_terminated_length': 15487.0, 'rewards/accuracy_reward/mean': 0.4296875, 'rewards/accuracy_reward/std': 0.4969765841960907, 'reward': 0.4296875, 'reward_std': 0.3027411997318268, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01958826184272766, 'sampling/sampling_logp_difference/max': 6.1424455642700195, 'sampling/importance_sampling_ratio/min': 0.0021496599074453115, 'sampling/importance_sampling_ratio/mean': 1.0000343322753906, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8368816301226616, 'clip_ratio/low_mean': 4.4303845015747356e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.971898143438011e-06, 'clip_ratio/high_max': 7.887592573752045e-06, 
'clip_ratio/region_mean': 4.6275743216028786e-05, 'epoch': 0.63} + + 67%|██████▋ | 689/1024 [31:38:48<15:44:30, 169.17s/it]INFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:03:48 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 690/1024 [31:41:48<15:58:40, 172.22s/it] + {'loss': 0.0574, 'grad_norm': 0.0039004215504974127, 'learning_rate': 1e-05, 'num_tokens': 610341090.0, 'completions/mean_length': 7594.3671875, 'completions/min_length': 807.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0625, 'completions/mean_terminated_length': 7008.39208984375, 'completions/min_terminated_length': 807.0, 'completions/max_terminated_length': 16065.0, 'rewards/accuracy_reward/mean': 0.3984375, 'rewards/accuracy_reward/std': 0.4915000796318054, 'reward': 0.3984375, 'reward_std': 0.3284856975078583, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.01734849065542221, 'sampling/sampling_logp_difference/max': 10.124999046325684, 'sampling/importance_sampling_ratio/min': 4.006533345091157e-05, 'sampling/importance_sampling_ratio/mean': 0.9999041557312012, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.692665733397007, 'clip_ratio/low_mean': 3.859445814669016e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.6727028625828098e-06, 'clip_ratio/high_max': 1.0690811450331239e-05, 'clip_ratio/region_mean': 4.1267160668212455e-05, 'epoch': 0.63} + + 67%|██████▋ | 690/1024 [31:41:48<15:58:40, 172.22s/it]INFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:06:47 [block_pool.py:292] Successfully reset prefix cache + + 67%|██████▋ | 691/1024 
[31:44:47<16:06:46, 174.19s/it] + {'loss': 0.0782, 'grad_norm': 0.004913663491606712, 'learning_rate': 1e-05, 'num_tokens': 611339726.0, 'completions/mean_length': 7640.09375, 'completions/min_length': 826.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.03125, 'completions/mean_terminated_length': 7358.0322265625, 'completions/min_terminated_length': 826.0, 'completions/max_terminated_length': 16292.0, 'rewards/accuracy_reward/mean': 0.359375, 'rewards/accuracy_reward/std': 0.481702595949173, 'reward': 0.359375, 'reward_std': 0.3356248140335083, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.019604282453656197, 'sampling/sampling_logp_difference/max': 16.983896255493164, 'sampling/importance_sampling_ratio/min': 4.2071459205317296e-08, 'sampling/importance_sampling_ratio/mean': 0.9998912811279297, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8469130471348763, 'clip_ratio/low_mean': 5.9335616697353544e-05, 'clip_ratio/low_min': 5.472375505632954e-06, 'clip_ratio/high_mean': 2.7999831218039617e-06, 'clip_ratio/high_max': 4.406994776218198e-06, 'clip_ratio/region_mean': 6.21355998191575e-05, 'epoch': 0.64} + + 67%|██████▋ | 691/1024 [31:44:47<16:06:46, 174.19s/it]INFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:09:46 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 692/1024 [31:47:17<15:24:17, 167.04s/it] + {'loss': 0.0371, 'grad_norm': 0.0032354791183024645, 'learning_rate': 1e-05, 'num_tokens': 612005495.0, 'completions/mean_length': 5063.6953125, 'completions/min_length': 319.0, 'completions/max_length': 15895.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5063.6953125, 'completions/min_terminated_length': 319.0, 'completions/max_terminated_length': 15895.0, 
'rewards/accuracy_reward/mean': 0.59375, 'rewards/accuracy_reward/std': 0.4930621087551117, 'reward': 0.59375, 'reward_std': 0.18990948796272278, 'frac_reward_zero_std': 0.5625, 'sampling/sampling_logp_difference/mean': 0.0178166925907135, 'sampling/sampling_logp_difference/max': 3.8934366703033447, 'sampling/importance_sampling_ratio/min': 0.02037520334124565, 'sampling/importance_sampling_ratio/mean': 0.9999009370803833, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7586102113127708, 'clip_ratio/low_mean': 2.7830240469484124e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.209913979342673e-06, 'clip_ratio/high_max': 1.4971937162044924e-05, 'clip_ratio/region_mean': 3.304015490357415e-05, 'epoch': 0.64} + + 68%|██████▊ | 692/1024 [31:47:17<15:24:17, 167.04s/it]INFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:12:17 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 693/1024 [31:49:53<15:04:06, 163.89s/it] + {'loss': 0.0357, 'grad_norm': 0.004039868246763945, 'learning_rate': 1e-05, 'num_tokens': 612870060.0, 'completions/mean_length': 6542.1640625, 'completions/min_length': 665.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6385.94482421875, 'completions/min_terminated_length': 665.0, 'completions/max_terminated_length': 14868.0, 'rewards/accuracy_reward/mean': 0.453125, 'rewards/accuracy_reward/std': 0.4997538626194, 'reward': 0.453125, 'reward_std': 0.2590789198875427, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.019780561327934265, 'sampling/sampling_logp_difference/max': 24.499982833862305, 'sampling/importance_sampling_ratio/min': 2.2897740994953786e-11, 'sampling/importance_sampling_ratio/mean': 0.9998836517333984, 
'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.847448967397213, 'clip_ratio/low_mean': 1.1576638144106255e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.3344492698670365e-06, 'clip_ratio/high_max': 2.1337797079468146e-05, 'clip_ratio/region_mean': 1.691108741397329e-05, 'epoch': 0.64} + + 68%|██████▊ | 693/1024 [31:49:53<15:04:06, 163.89s/it]INFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:14:53 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 694/1024 [31:52:36<14:59:16, 163.50s/it] + {'loss': 0.0632, 'grad_norm': 0.0023007066920399666, 'learning_rate': 1e-05, 'num_tokens': 613633581.0, 'completions/mean_length': 5805.8203125, 'completions/min_length': 919.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5551.9443359375, 'completions/min_terminated_length': 919.0, 'completions/max_terminated_length': 16287.0, 'rewards/accuracy_reward/mean': 0.609375, 'rewards/accuracy_reward/std': 0.4898075461387634, 'reward': 0.609375, 'reward_std': 0.23857943713665009, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.016993921250104904, 'sampling/sampling_logp_difference/max': 8.249631881713867, 'sampling/importance_sampling_ratio/min': 0.00026135475491173565, 'sampling/importance_sampling_ratio/mean': 1.0000262260437012, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6972410827875137, 'clip_ratio/low_mean': 3.4833526569855167e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.5834565374461818e-06, 'clip_ratio/high_max': 6.333826149784727e-06, 'clip_ratio/region_mean': 3.641698299361451e-05, 'epoch': 0.64} + + 68%|██████▊ | 694/1024 [31:52:36<14:59:16, 163.50s/it]INFO 12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache +INFO 
12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:17:36 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 695/1024 [31:55:51<15:47:57, 172.88s/it] + {'loss': 0.0413, 'grad_norm': 0.0029130352195352316, 'learning_rate': 1e-05, 'num_tokens': 614611881.0, 'completions/mean_length': 7504.65625, 'completions/min_length': 843.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.09375, 'completions/mean_terminated_length': 6586.103515625, 'completions/min_terminated_length': 843.0, 'completions/max_terminated_length': 16249.0, 'rewards/accuracy_reward/mean': 0.3671875, 'rewards/accuracy_reward/std': 0.4839322865009308, 'reward': 0.3671875, 'reward_std': 0.23250606656074524, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.018666472285985947, 'sampling/sampling_logp_difference/max': 13.109896659851074, 'sampling/importance_sampling_ratio/min': 2.025089543167269e-06, 'sampling/importance_sampling_ratio/mean': 0.999863862991333, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7908455803990364, 'clip_ratio/low_mean': 1.501361566624837e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.6609881754447997e-06, 'clip_ratio/high_max': 6.643952701779199e-06, 'clip_ratio/region_mean': 1.667460389853659e-05, 'epoch': 0.64} + + 68%|██████▊ | 695/1024 [31:55:51<15:47:57, 172.88s/it]INFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:20:50 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 696/1024 [31:57:56<14:27:16, 158.65s/it] + {'loss': 0.1054, 'grad_norm': 0.0020515238866209984, 'learning_rate': 1e-05, 'num_tokens': 615355915.0, 'completions/mean_length': 5627.265625, 'completions/min_length': 233.0, 
'completions/max_length': 13984.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 5627.265625, 'completions/min_terminated_length': 233.0, 'completions/max_terminated_length': 13984.0, 'rewards/accuracy_reward/mean': 0.421875, 'rewards/accuracy_reward/std': 0.4957992732524872, 'reward': 0.421875, 'reward_std': 0.26827272772789, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01694992370903492, 'sampling/sampling_logp_difference/max': 5.874999046325684, 'sampling/importance_sampling_ratio/min': 0.002808797173202038, 'sampling/importance_sampling_ratio/mean': 0.9999716877937317, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7167766839265823, 'clip_ratio/low_mean': 5.670640712196473e-05, 'clip_ratio/low_min': 6.148246484372066e-06, 'clip_ratio/high_mean': 4.543699901660148e-06, 'clip_ratio/high_max': 1.817479960664059e-05, 'clip_ratio/region_mean': 6.125010668256436e-05, 'epoch': 0.64} + + 68%|██████▊ | 696/1024 [31:57:56<14:27:16, 158.65s/it]INFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:22:56 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 697/1024 [32:00:46<14:43:01, 162.02s/it] + {'loss': 0.0428, 'grad_norm': 0.003425017697736621, 'learning_rate': 1e-05, 'num_tokens': 616159416.0, 'completions/mean_length': 6129.9140625, 'completions/min_length': 1201.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5967.1513671875, 'completions/min_terminated_length': 1201.0, 'completions/max_terminated_length': 14713.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2188364714384079, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 
0.01828661933541298, 'sampling/sampling_logp_difference/max': 5.187221050262451, 'sampling/importance_sampling_ratio/min': 0.005587513092905283, 'sampling/importance_sampling_ratio/mean': 0.9999443292617798, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7654511705040932, 'clip_ratio/low_mean': 5.3280599786376115e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 5.3280599786376115e-05, 'epoch': 0.64} + + 68%|██████▊ | 697/1024 [32:00:46<14:43:01, 162.02s/it]INFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:25:46 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 698/1024 [32:03:31<14:45:22, 162.95s/it] + {'loss': 0.0607, 'grad_norm': 0.005707201547920704, 'learning_rate': 1e-05, 'num_tokens': 617101738.0, 'completions/mean_length': 7219.078125, 'completions/min_length': 649.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 7146.91357421875, 'completions/min_terminated_length': 649.0, 'completions/max_terminated_length': 16340.0, 'rewards/accuracy_reward/mean': 0.53125, 'rewards/accuracy_reward/std': 0.5009832978248596, 'reward': 0.53125, 'reward_std': 0.2835350036621094, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.01983051374554634, 'sampling/sampling_logp_difference/max': 12.874998092651367, 'sampling/importance_sampling_ratio/min': 2.5612937406549463e-06, 'sampling/importance_sampling_ratio/mean': 0.9999914765357971, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.847568191587925, 'clip_ratio/low_mean': 3.4785461366482195e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.459671456264914e-06, 'clip_ratio/high_max': 2.1838685825059656e-05, 'clip_ratio/region_mean': 
4.024513225431292e-05, 'epoch': 0.64} + + 68%|██████▊ | 698/1024 [32:03:31<14:45:22, 162.95s/it]INFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:28:31 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 699/1024 [32:06:05<14:27:07, 160.09s/it] + {'loss': 0.078, 'grad_norm': 0.004018646199256182, 'learning_rate': 1e-05, 'num_tokens': 617903030.0, 'completions/mean_length': 6116.96875, 'completions/min_length': 1371.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5870.56005859375, 'completions/min_terminated_length': 1371.0, 'completions/max_terminated_length': 14972.0, 'rewards/accuracy_reward/mean': 0.5546875, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.5546875, 'reward_std': 0.2569621503353119, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017638593912124634, 'sampling/sampling_logp_difference/max': 8.749999046325684, 'sampling/importance_sampling_ratio/min': 0.00015846146561671048, 'sampling/importance_sampling_ratio/mean': 0.9999732971191406, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7148991823196411, 'clip_ratio/low_mean': 5.492671812135086e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.55213056638604e-06, 'clip_ratio/high_max': 2.676450185390422e-05, 'clip_ratio/region_mean': 6.347884914248425e-05, 'epoch': 0.64} + + 68%|██████▊ | 699/1024 [32:06:05<14:27:07, 160.09s/it]INFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:31:04 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 700/1024 [32:08:57<14:44:19, 163.76s/it] + 
{'loss': 0.0617, 'grad_norm': 0.00282766274176538, 'learning_rate': 1e-05, 'num_tokens': 618880312.0, 'completions/mean_length': 7486.515625, 'completions/min_length': 611.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 7272.9765625, 'completions/min_terminated_length': 611.0, 'completions/max_terminated_length': 15232.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32089442014694214, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01940794661641121, 'sampling/sampling_logp_difference/max': 9.180124282836914, 'sampling/importance_sampling_ratio/min': 0.0001030677231028676, 'sampling/importance_sampling_ratio/mean': 0.9999787211418152, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7912377193570137, 'clip_ratio/low_mean': 7.103690825260855e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.9004990008397726e-06, 'clip_ratio/high_max': 3.844970706268214e-06, 'clip_ratio/region_mean': 7.29374083903167e-05, 'epoch': 0.64} + + 68%|██████▊ | 700/1024 [32:08:57<14:44:19, 163.76s/it]INFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:33:57 [block_pool.py:292] Successfully reset prefix cache + + 68%|██████▊ | 701/1024 [32:12:07<15:24:30, 171.73s/it] + {'loss': 0.0438, 'grad_norm': 0.0016839519375935197, 'learning_rate': 1e-05, 'num_tokens': 619834002.0, 'completions/mean_length': 7297.453125, 'completions/min_length': 743.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0703125, 'completions/mean_terminated_length': 6610.23583984375, 'completions/min_terminated_length': 743.0, 'completions/max_terminated_length': 13644.0, 'rewards/accuracy_reward/mean': 0.3359375, 
'rewards/accuracy_reward/std': 0.47417303919792175, 'reward': 0.3359375, 'reward_std': 0.2801200747489929, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.019356656819581985, 'sampling/sampling_logp_difference/max': 7.59285831451416, 'sampling/importance_sampling_ratio/min': 0.0005040382966399193, 'sampling/importance_sampling_ratio/mean': 0.9999658465385437, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8420139253139496, 'clip_ratio/low_mean': 3.103233757428825e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.616161613237637e-06, 'clip_ratio/high_max': 2.241842275907402e-05, 'clip_ratio/region_mean': 3.76484995285864e-05, 'epoch': 0.64} + + 68%|██████▊ | 701/1024 [32:12:07<15:24:30, 171.73s/it]INFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:37:07 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-02 21:38:49,411 - math_verify.grader - WARNING - Timeout during comparison + + 69%|██████▊ | 702/1024 [32:14:48<15:03:21, 168.33s/it] + {'loss': 0.0822, 'grad_norm': 0.00550073804333806, 'learning_rate': 1e-05, 'num_tokens': 620615054.0, 'completions/mean_length': 5935.53125, 'completions/min_length': 632.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5684.76806640625, 'completions/min_terminated_length': 632.0, 'completions/max_terminated_length': 15471.0, 'rewards/accuracy_reward/mean': 0.4609375, 'rewards/accuracy_reward/std': 0.5004304051399231, 'reward': 0.4609375, 'reward_std': 0.3366856575012207, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01712688058614731, 'sampling/sampling_logp_difference/max': 10.624999046325684, 'sampling/importance_sampling_ratio/min': 2.4300854420289397e-05, 
'sampling/importance_sampling_ratio/mean': 1.0000221729278564, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.6855737417936325, 'clip_ratio/low_mean': 4.7923438614816405e-05, 'clip_ratio/low_min': 3.219243353669299e-06, 'clip_ratio/high_mean': 2.447962742735399e-06, 'clip_ratio/high_max': 9.791850970941596e-06, 'clip_ratio/region_mean': 5.0371401357551804e-05, 'epoch': 0.65} + + 69%|██████▊ | 702/1024 [32:14:48<15:03:21, 168.33s/it]INFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:39:47 [block_pool.py:292] Successfully reset prefix cache + + 69%|██████▊ | 703/1024 [32:17:30<14:51:09, 166.57s/it] + {'loss': 0.0922, 'grad_norm': 0.005174044985324144, 'learning_rate': 1e-05, 'num_tokens': 621407854.0, 'completions/mean_length': 6016.0625, 'completions/min_length': 986.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 5851.4921875, 'completions/min_terminated_length': 986.0, 'completions/max_terminated_length': 14395.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.25330984592437744, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.017704609781503677, 'sampling/sampling_logp_difference/max': 10.249993324279785, 'sampling/importance_sampling_ratio/min': 3.535773794283159e-05, 'sampling/importance_sampling_ratio/mean': 0.9999493956565857, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7685846760869026, 'clip_ratio/low_mean': 2.6169475859205704e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 3.3923624869203195e-06, 'clip_ratio/high_max': 1.3569449947681278e-05, 'clip_ratio/region_mean': 2.95618385734997e-05, 'epoch': 0.65} + + 69%|██████▊ | 703/1024 [32:17:30<14:51:09, 166.57s/it]INFO 
12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:42:30 [block_pool.py:292] Successfully reset prefix cache + + 69%|██████▉ | 704/1024 [32:20:09<14:36:03, 164.26s/it] + {'loss': 0.1083, 'grad_norm': 0.0022989478893578053, 'learning_rate': 1e-05, 'num_tokens': 622246633.0, 'completions/mean_length': 6402.3984375, 'completions/min_length': 443.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6323.80322265625, 'completions/min_terminated_length': 443.0, 'completions/max_terminated_length': 15771.0, 'rewards/accuracy_reward/mean': 0.4453125, 'rewards/accuracy_reward/std': 0.4989531338214874, 'reward': 0.4453125, 'reward_std': 0.32089439034461975, 'frac_reward_zero_std': 0.25, 'sampling/sampling_logp_difference/mean': 0.01977568492293358, 'sampling/sampling_logp_difference/max': 14.645465850830078, 'sampling/importance_sampling_ratio/min': 4.360687739790592e-07, 'sampling/importance_sampling_ratio/mean': 0.9999479055404663, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8285454586148262, 'clip_ratio/low_mean': 3.712984198500635e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.2330010551740997e-06, 'clip_ratio/high_max': 8.932004220696399e-06, 'clip_ratio/region_mean': 3.936284304018045e-05, 'epoch': 0.65} + + 69%|██████▉ | 704/1024 [32:20:09<14:36:03, 164.26s/it]INFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:45:13 [block_pool.py:292] Successfully reset prefix cache +/mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have 
requires_grad=True. Gradients will be None + warnings.warn( + + 69%|██████▉ | 705/1024 [32:22:47<14:23:54, 162.49s/it] + {'loss': 0.0586, 'grad_norm': 0.0039679198525846004, 'learning_rate': 1e-05, 'num_tokens': 623047420.0, 'completions/mean_length': 6085.7734375, 'completions/min_length': 559.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5838.6162109375, 'completions/min_terminated_length': 559.0, 'completions/max_terminated_length': 16192.0, 'rewards/accuracy_reward/mean': 0.3828125, 'rewards/accuracy_reward/std': 0.4879830479621887, 'reward': 0.3828125, 'reward_std': 0.2477683573961258, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.01756519451737404, 'sampling/sampling_logp_difference/max': 10.26925277709961, 'sampling/importance_sampling_ratio/min': 3.468328213784844e-05, 'sampling/importance_sampling_ratio/mean': 0.9999336004257202, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.788465715944767, 'clip_ratio/low_mean': 3.0628862987214234e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 1.1193895943506504e-06, 'clip_ratio/high_max': 4.477558377402602e-06, 'clip_ratio/region_mean': 3.174825269525172e-05, 'epoch': 0.65} + + 69%|██████▉ | 705/1024 [32:22:47<14:23:54, 162.49s/it]INFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:47:47 [block_pool.py:292] Successfully reset prefix cache + + 69%|██████▉ | 706/1024 [32:25:13<13:54:52, 157.52s/it] + {'loss': 0.0116, 'grad_norm': 0.005460201762616634, 'learning_rate': 1e-05, 'num_tokens': 623879902.0, 'completions/mean_length': 6338.265625, 'completions/min_length': 757.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6259.16552734375, 
'completions/min_terminated_length': 757.0, 'completions/max_terminated_length': 15766.0, 'rewards/accuracy_reward/mean': 0.34375, 'rewards/accuracy_reward/std': 0.47682511806488037, 'reward': 0.34375, 'reward_std': 0.21884137392044067, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.02087930217385292, 'sampling/sampling_logp_difference/max': 4.832030773162842, 'sampling/importance_sampling_ratio/min': 0.007970319129526615, 'sampling/importance_sampling_ratio/mean': 1.000030279159546, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9556885957717896, 'clip_ratio/low_mean': 1.3522747963179427e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 4.254188070262899e-06, 'clip_ratio/high_max': 1.241185282196966e-05, 'clip_ratio/region_mean': 1.7776936260816e-05, 'epoch': 0.65} + + 69%|██████▉ | 706/1024 [32:25:13<13:54:52, 157.52s/it]INFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:50:13 [block_pool.py:292] Successfully reset prefix cache +[OpenTinker] 2025-12-02 21:52:02,228 - math_verify.grader - WARNING - Timeout during comparison + + 69%|██████▉ | 707/1024 [32:28:06<14:16:36, 162.14s/it] + {'loss': 0.0485, 'grad_norm': 0.0026750562246888876, 'learning_rate': 1e-05, 'num_tokens': 624797851.0, 'completions/mean_length': 6995.6015625, 'completions/min_length': 1643.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.015625, 'completions/mean_terminated_length': 6846.57958984375, 'completions/min_terminated_length': 1643.0, 'completions/max_terminated_length': 15728.0, 'rewards/accuracy_reward/mean': 0.28125, 'rewards/accuracy_reward/std': 0.4513758420944214, 'reward': 0.28125, 'reward_std': 0.14123955368995667, 'frac_reward_zero_std': 0.6875, 'sampling/sampling_logp_difference/mean': 0.021924620494246483, 
'sampling/sampling_logp_difference/max': 9.499988555908203, 'sampling/importance_sampling_ratio/min': 7.485268724849448e-05, 'sampling/importance_sampling_ratio/mean': 1.000006079673767, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 1.0373736545443535, 'clip_ratio/low_mean': 3.0928131309337914e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 8.542219234186632e-07, 'clip_ratio/high_max': 3.416887693674653e-06, 'clip_ratio/region_mean': 3.178235323275658e-05, 'epoch': 0.65} + + 69%|██████▉ | 707/1024 [32:28:06<14:16:36, 162.14s/it]INFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:53:06 [block_pool.py:292] Successfully reset prefix cache + + 69%|██████▉ | 708/1024 [32:31:19<15:02:33, 171.37s/it] + {'loss': 0.0458, 'grad_norm': 0.0032085489947348833, 'learning_rate': 1e-05, 'num_tokens': 625759543.0, 'completions/mean_length': 7361.28125, 'completions/min_length': 832.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.1015625, 'completions/mean_terminated_length': 6341.3212890625, 'completions/min_terminated_length': 832.0, 'completions/max_terminated_length': 16279.0, 'rewards/accuracy_reward/mean': 0.4375, 'rewards/accuracy_reward/std': 0.49802759289741516, 'reward': 0.4375, 'reward_std': 0.27062684297561646, 'frac_reward_zero_std': 0.4375, 'sampling/sampling_logp_difference/mean': 0.018720708787441254, 'sampling/sampling_logp_difference/max': 6.406182765960693, 'sampling/importance_sampling_ratio/min': 0.001651315949857235, 'sampling/importance_sampling_ratio/mean': 1.0000088214874268, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8065696209669113, 'clip_ratio/low_mean': 5.126845326230978e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 6.276005024119513e-06, 'clip_ratio/high_max': 2.134235910489224e-05, 'clip_ratio/region_mean': 
5.754445828642929e-05, 'epoch': 0.65} + + 69%|██████▉ | 708/1024 [32:31:19<15:02:33, 171.37s/it]INFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:56:19 [block_pool.py:292] Successfully reset prefix cache + + 69%|██████▉ | 709/1024 [32:33:56<14:36:38, 166.98s/it] + {'loss': 0.0283, 'grad_norm': 0.0034961337223649025, 'learning_rate': 1e-05, 'num_tokens': 626602944.0, 'completions/mean_length': 6415.4453125, 'completions/min_length': 890.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0078125, 'completions/mean_terminated_length': 6336.95263671875, 'completions/min_terminated_length': 890.0, 'completions/max_terminated_length': 15793.0, 'rewards/accuracy_reward/mean': 0.3125, 'rewards/accuracy_reward/std': 0.4653336703777313, 'reward': 0.3125, 'reward_std': 0.2709311842918396, 'frac_reward_zero_std': 0.375, 'sampling/sampling_logp_difference/mean': 0.02052130736410618, 'sampling/sampling_logp_difference/max': 11.249995231628418, 'sampling/importance_sampling_ratio/min': 1.3007359484618064e-05, 'sampling/importance_sampling_ratio/mean': 0.9999567866325378, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.9348134994506836, 'clip_ratio/low_mean': 3.820702841039747e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 2.349654664612899e-06, 'clip_ratio/high_max': 9.398618658451596e-06, 'clip_ratio/region_mean': 4.055668296132353e-05, 'epoch': 0.65} + + 69%|██████▉ | 709/1024 [32:33:56<14:36:38, 166.98s/it]INFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 21:58:56 [block_pool.py:292] Successfully reset prefix cache + + 69%|██████▉ | 710/1024 [32:37:01<15:02:02, 172.36s/it] + 
{'loss': 0.0559, 'grad_norm': 0.004416701849550009, 'learning_rate': 1e-05, 'num_tokens': 627629595.0, 'completions/mean_length': 7879.3359375, 'completions/min_length': 592.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0859375, 'completions/mean_terminated_length': 7079.75244140625, 'completions/min_terminated_length': 592.0, 'completions/max_terminated_length': 15440.0, 'rewards/accuracy_reward/mean': 0.4140625, 'rewards/accuracy_reward/std': 0.49449479579925537, 'reward': 0.4140625, 'reward_std': 0.31930169463157654, 'frac_reward_zero_std': 0.3125, 'sampling/sampling_logp_difference/mean': 0.018000833690166473, 'sampling/sampling_logp_difference/max': 8.248465538024902, 'sampling/importance_sampling_ratio/min': 0.0002616597630549222, 'sampling/importance_sampling_ratio/mean': 0.9999274015426636, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.7092025354504585, 'clip_ratio/low_mean': 5.279648712530616e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 7.977412849413668e-06, 'clip_ratio/high_max': 3.190965139765467e-05, 'clip_ratio/region_mean': 6.077389980418957e-05, 'epoch': 0.65} + + 69%|██████▉ | 710/1024 [32:37:01<15:02:02, 172.36s/it]INFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 22:02:00 [block_pool.py:292] Successfully reset prefix cache + + 69%|██████▉ | 711/1024 [32:39:38<14:35:45, 167.88s/it] + {'loss': 0.0813, 'grad_norm': 0.0047090682201087475, 'learning_rate': 1e-05, 'num_tokens': 628409064.0, 'completions/mean_length': 5936.2890625, 'completions/min_length': 491.0, 'completions/max_length': 16384.0, 'completions/clipped_ratio': 0.0234375, 'completions/mean_terminated_length': 5685.54443359375, 'completions/min_terminated_length': 491.0, 'completions/max_terminated_length': 14801.0, 'rewards/accuracy_reward/mean': 0.515625, 
'rewards/accuracy_reward/std': 0.5017194747924805, 'reward': 0.515625, 'reward_std': 0.22119548916816711, 'frac_reward_zero_std': 0.5, 'sampling/sampling_logp_difference/mean': 0.01985335350036621, 'sampling/sampling_logp_difference/max': 7.311888217926025, 'sampling/importance_sampling_ratio/min': 0.0006675553740933537, 'sampling/importance_sampling_ratio/mean': 0.9999631643295288, 'sampling/importance_sampling_ratio/max': 2.0, 'entropy': 0.8411448448896408, 'clip_ratio/low_mean': 5.0108070581700304e-05, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 5.319128604241996e-06, 'clip_ratio/high_max': 2.1276514416967984e-05, 'clip_ratio/region_mean': 5.54271991859423e-05, 'epoch': 0.65} + + 69%|██████▉ | 711/1024 [32:39:38<14:35:45, 167.88s/it]INFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 22:04:38 [block_pool.py:292] Successfully reset prefix cache diff --git a/grpo_dora_7b_20251202_013940/README.md b/grpo_dora_7b_20251202_013940/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2610c1d81840332bfdf7b5c84963fa9d35c5ec12 --- /dev/null +++ b/grpo_dora_7b_20251202_013940/README.md @@ -0,0 +1,68 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +library_name: transformers +model_name: dapo_dora_7b_20251202_013940 +tags: +- generated_from_trainer +- grpo +- trl +licence: license +--- + +# Model Card for dapo_dora_7b_20251202_013940 + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/6dmxhs58) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.25.0 +- Transformers: 4.57.1 +- Pytorch: 2.8.0 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/grpo_dora_7b_20251202_013940/output.log b/grpo_dora_7b_20251202_013940/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5ff56981d2e32cb709ef86cbe9e4c3203d93103e --- /dev/null +++ b/grpo_dora_7b_20251202_013940/output.log @@ -0,0 +1,4721 @@ +W1202 01:40:02.367000 101639 torch/distributed/run.py:774] +W1202 01:40:02.367000 101639 torch/distributed/run.py:774] ***************************************** +W1202 01:40:02.367000 101639 
torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W1202 01:40:02.367000 101639 torch/distributed/run.py:774] ***************************************** +INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 01:40:24 [__init__.py:216] Automatically detected platform cuda. +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +TrainConfig(common=CommonConfig(seed=42, 
debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, 
use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-02 01:40:30,218 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it +[OpenTinker] 2025-12-02 01:40:30,218 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it +[OpenTinker] 2025-12-02 01:40:30,218 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_dora_7b_20251202_013940', run_name='outputs/dapo_dora_7b_20251202_013940', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=2, 
top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-02 01:40:30,221 - root - INFO - Output directory outputs/dapo_dora_7b_20251202_013940 already exists, using it +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: setting up run i7o9cken +wandb: setting up run jco9wivt +wandb: setting up run 6dmxhs58 +wandb: setting up run 9u4d73kf +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-i7o9cken +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_dora_7b_20251202_013940 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/i7o9cken +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-jco9wivt +wandb: Run `wandb offline` to turn off syncing. 
+wandb: Syncing run outputs/dapo_dora_7b_20251202_013940 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/jco9wivt +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 01:40:35,900 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 01:40:35,900 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-6dmxhs58 +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_dora_7b_20251202_013940 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/6dmxhs58 +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 01:40:36,151 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 01:40:36,151 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_014033-9u4d73kf +wandb: Run `wandb offline` to turn off syncing. 
+wandb: Syncing run outputs/dapo_dora_7b_20251202_013940 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/9u4d73kf +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 01:40:36,297 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 01:40:36,297 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 01:40:36,446 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 01:40:36,447 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +[OpenTinker] 2025-12-02 01:40:37,277 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 01:40:37,397 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 01:40:37,473 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 01:40:37,656 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 01:40:40,347 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +[OpenTinker] 2025-12-02 01:40:40,404 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +[OpenTinker] 2025-12-02 01:40:40,483 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +[OpenTinker] 2025-12-02 
01:40:40,573 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! + + Loading checkpoint shards: 0%| | 0/2 [00:00 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0> +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0> +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0> +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. 
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. 
+lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO ncclCommInitRankConfig comm 0x1f8d2890 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x1a9a18bbe7484081 - Init START +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e401220 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x1a9a18bbe7484081 - Init START +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO ncclCommInitRankConfig comm 0x1e31d2c0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x1a9a18bbe7484081 - Init START +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1c5c60 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x1a9a18bbe7484081 - Init START +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Bootstrap timings total 0.001227 (create 0.000018, send 0.000094, recv 0.000685, ring 0.000150, delay 0.000000) +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Bootstrap timings total 0.005961 (create 0.000022, send 0.000079, 
recv 0.004801, ring 0.000737, delay 0.000001) +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Bootstrap timings total 0.000679 (create 0.000020, send 0.000094, recv 0.000180, ring 0.000067, delay 0.000000) +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Bootstrap timings total 0.003019 (create 0.000020, send 0.000097, recv 0.000073, ring 0.000096, delay 0.000000) +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO comm 0x1f1c5c60 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO comm 0x1e401220 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. 
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO comm 0x1f8d2890 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO comm 0x1e31d2c0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. 
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO 
Channel 11/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-g2ri-2:101885:102984 [2] NCCL INFO [Proxy Service] Device 2 CPU core 151 +lshn-qs-g2ri-2:101885:102985 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 69 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-g2ri-2:101883:102987 [0] NCCL INFO [Proxy Service] Device 0 CPU core 166 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-g2ri-2:101886:102986 [3] NCCL INFO [Proxy Service] Device 3 CPU core 87 +lshn-qs-g2ri-2:101886:102988 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 90 +lshn-qs-g2ri-2:101883:102989 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 169 +lshn-qs-g2ri-2:101884:102990 [1] NCCL INFO [Proxy Service] Device 1 CPU core 147 +lshn-qs-g2ri-2:101884:102991 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 170 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1c5c60 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x1a9a18bbe7484081 - Init COMPLETE +lshn-qs-g2ri-2:101885:102978 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.90 (kernels 0.16, alloc 0.57, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.08, rest 0.04) +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO ncclCommInitRankConfig comm 0x1e31d2c0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x1a9a18bbe7484081 - Init COMPLETE +lshn-qs-g2ri-2:101884:102979 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.89 (kernels 0.16, alloc 0.57, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.08, rest 0.04) +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO ncclCommInitRankConfig comm 0x1f8d2890 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x1a9a18bbe7484081 - Init COMPLETE +lshn-qs-g2ri-2:101883:102976 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.92 (kernels 0.16, alloc 0.59, bootstrap 0.01, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.08, rest 0.04) +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e401220 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x1a9a18bbe7484081 - Init COMPLETE +lshn-qs-g2ri-2:101886:102977 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.90 (kernels 0.16, alloc 0.57, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.01, connections 0.09, rest 0.03) +[OpenTinker] 2025-12-02 01:40:52,806 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 01:40:52,809 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 01:40:52,812 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 01:40:52,827 - root - INFO - Training model with GRPO +INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'} +INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'} +INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'} +INFO 12-02 01:40:53 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 
'max_num_batched_tokens': 4096, 'max_num_seqs': 16, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'} +INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896 +INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896 +INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896 +INFO 12-02 01:41:09 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 01:41:09 [__init__.py:1815] Using max model len 16896 +INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 01:41:10 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 01:41:11 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. 
+INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null} +INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, 
tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null} +INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, 
kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null} +INFO 12-02 01:41:13 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), 
observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null} +[rank2]:[W1202 01:41:14.565456558 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +[rank1]:[W1202 01:41:14.662468277 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +[rank0]:[W1202 01:41:14.662551621 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +[rank3]:[W1202 01:41:14.682452457 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. 
(function operator()) +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO ncclCommSplit comm 0x20f332d0 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 1 color 2003953581 key 0- Init START +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO ncclCommSplit comm 0x20250b20 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 1 color 2003953581 key 2- Init START +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO ncclCommSplit comm 0x1f3cce90 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 1 color 2003953581 key 1- Init START +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO ncclCommSplit comm 0x1f52d880 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 1 color 2003953581 key 3- Init START +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 
+lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO comm 0x20250b20 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO comm 0x1f3cce90 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO comm 0x20f332d0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO comm 0x1f52d880 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Trees [0] 
-1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] 
NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103132 [1] NCCL INFO [Proxy Service] Device 1 CPU core 54 +lshn-qs-g2ri-2:101884:103133 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 65 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-g2ri-2:101883:103134 [0] NCCL INFO [Proxy Service] Device 0 CPU core 181 +lshn-qs-g2ri-2:101883:103135 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 182 +lshn-qs-g2ri-2:101885:103136 [2] NCCL INFO [Proxy Service] Device 2 CPU core 183 +lshn-qs-g2ri-2:101885:103137 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 90 +lshn-qs-g2ri-2:101886:103138 [3] NCCL INFO [Proxy Service] Device 3 CPU core 91 +lshn-qs-g2ri-2:101886:103139 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 188 +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 
| 512 +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO ncclCommSplit comm 0x20250b20 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 1 color 2003953581 key 2 - Init COMPLETE +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO ncclCommSplit comm 0x20f332d0 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 1 color 2003953581 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO ncclCommSplit comm 0x1f3cce90 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 1 color 2003953581 key 1 - Init COMPLETE +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO ncclCommSplit comm 0x1f52d880 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 1 color 2003953581 key 3 - Init COMPLETE +lshn-qs-g2ri-2:101885:103122 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.20 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.13) +lshn-qs-g2ri-2:101883:103128 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.11 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.03) +lshn-qs-g2ri-2:101884:103127 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.11 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.03) +lshn-qs-g2ri-2:101886:103131 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, 
bootstrap 0.00, allgathers 0.02, topo 0.03, graphs 0.01, connections 0.02, rest 0.01) +[Gloo] Rank 1 is connected to 3[Gloo] Rank 0 peer ranks. Expected number of connected peer ranks is : is connected to 33 + peer ranks. Expected number of connected peer ranks is : [Gloo] Rank [Gloo] Rank 23 +3 is connected to is connected to 33 peer ranks. peer ranks. Expected number of connected peer ranks is : Expected number of connected peer ranks is : 33 + +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO ncclCommSplit comm 0x21044ab0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 2 color 59908776 key 0- Init START +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO comm 0x21044ab0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO 
Channel 01/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 33/64 : 
0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] 
-1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101883:103162 [0] NCCL INFO [Proxy Service] Device 0 CPU core 51 +lshn-qs-g2ri-2:101883:103163 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 73 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO ncclCommSplit comm 0x21044ab0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 2 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101883:103158 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO ncclCommSplit comm 0x1f4e1200 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 4 color 440515407 key 0- Init START +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO comm 0x1f4e1200 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101884:103177 [1] NCCL INFO [Proxy Service] Device 1 CPU core 170 +lshn-qs-g2ri-2:101884:103178 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 177 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO ncclCommSplit comm 0x1f4e1200 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 4 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101884:103173 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO ncclCommSplit comm 0x203652a0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 6 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO comm 0x203652a0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101885:103192 [2] NCCL INFO [Proxy Service] Device 2 CPU core 176 +lshn-qs-g2ri-2:101885:103193 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 190 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO ncclCommSplit comm 0x203652a0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 6 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101885:103188 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO ncclCommSplit comm 0x1f641820 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 8 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO comm 0x1f641820 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101886:103209 [3] NCCL INFO [Proxy Service] Device 3 CPU core 62 +lshn-qs-g2ri-2:101886:103210 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 65 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO ncclCommSplit comm 0x1f641820 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 8 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101886:103203 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO ncclCommSplit comm 0x227a2cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 9 color 59908776 key 0- Init START +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO comm 0x227a2cc0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101883:103218 [0] NCCL INFO [Proxy Service] Device 0 CPU core 64 +lshn-qs-g2ri-2:101883:103219 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 58 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO ncclCommSplit comm 0x227a2cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 9 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101883:103208 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO ncclCommSplit comm 0x20c3aa70 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 11 color 440515407 key 0- Init START +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO comm 0x20c3aa70 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101884:103233 [1] NCCL INFO [Proxy Service] Device 1 CPU core 183 +lshn-qs-g2ri-2:101884:103234 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 165 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO ncclCommSplit comm 0x20c3aa70 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 11 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101884:103229 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO ncclCommSplit comm 0x21ac25f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 13 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO comm 0x21ac25f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101885:103248 [2] NCCL INFO [Proxy Service] Device 2 CPU core 78 +lshn-qs-g2ri-2:101885:103249 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 170 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO ncclCommSplit comm 0x21ac25f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 13 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101885:103244 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO ncclCommSplit comm 0x20d8aea0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 15 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO comm 0x20d8aea0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101886:103265 [3] NCCL INFO [Proxy Service] Device 3 CPU core 168 +lshn-qs-g2ri-2:101886:103266 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 66 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO ncclCommSplit comm 0x20d8aea0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 15 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101886:103259 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO ncclCommSplit comm 0x228aa8d0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 16 color 59908776 key 0- Init START +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO comm 0x228aa8d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101883:103274 [0] NCCL INFO [Proxy Service] Device 0 CPU core 54 +lshn-qs-g2ri-2:101883:103275 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 67 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO ncclCommSplit comm 0x228aa8d0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 16 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101883:103264 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO ncclCommSplit comm 0x20d42680 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 18 color 440515407 key 0- Init START +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO comm 0x20d42680 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101884:103289 [1] NCCL INFO [Proxy Service] Device 1 CPU core 79 +lshn-qs-g2ri-2:101884:103290 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 169 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO ncclCommSplit comm 0x20d42680 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 18 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101884:103285 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO ncclCommSplit comm 0x21bca200 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 20 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO comm 0x21bca200 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101885:103304 [2] NCCL INFO [Proxy Service] Device 2 CPU core 177 +lshn-qs-g2ri-2:101885:103305 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 188 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO ncclCommSplit comm 0x21bca200 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 20 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101885:103300 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.08, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO ncclCommSplit comm 0x20e92ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 22 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO comm 0x20e92ab0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101886:103321 [3] NCCL INFO [Proxy Service] Device 3 CPU core 58 +lshn-qs-g2ri-2:101886:103322 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 180 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO ncclCommSplit comm 0x20e92ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 22 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101886:103315 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO ncclCommSplit comm 0x229b24e0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 23 color 59908776 key 0- Init START +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO comm 0x229b24e0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101883:103330 [0] NCCL INFO [Proxy Service] Device 0 CPU core 183 +lshn-qs-g2ri-2:101883:103331 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 165 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO ncclCommSplit comm 0x229b24e0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 23 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101883:103320 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO ncclCommSplit comm 0x20e4a290 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 25 color 440515407 key 0- Init START +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO comm 0x20e4a290 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101884:103345 [1] NCCL INFO [Proxy Service] Device 1 CPU core 57 +lshn-qs-g2ri-2:101884:103346 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 60 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO ncclCommSplit comm 0x20e4a290 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 25 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101884:103341 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO ncclCommSplit comm 0x21cd1e10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 27 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO comm 0x21cd1e10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101885:103360 [2] NCCL INFO [Proxy Service] Device 2 CPU core 86 +lshn-qs-g2ri-2:101885:103361 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 190 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO ncclCommSplit comm 0x21cd1e10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 27 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101885:103356 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO ncclCommSplit comm 0x20f9a6c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 29 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO comm 0x20f9a6c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101886:103377 [3] NCCL INFO [Proxy Service] Device 3 CPU core 50 +lshn-qs-g2ri-2:101886:103378 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 54 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO ncclCommSplit comm 0x20f9a6c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 29 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101886:103371 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO ncclCommSplit comm 0x22aba0f0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 30 color 59908776 key 0- Init START +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO comm 0x22aba0f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101883:103386 [0] NCCL INFO [Proxy Service] Device 0 CPU core 67 +lshn-qs-g2ri-2:101883:103387 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 177 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO ncclCommSplit comm 0x22aba0f0 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 30 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101883:103376 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO ncclCommSplit comm 0x20f51ea0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 32 color 440515407 key 0- Init START +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO comm 0x20f51ea0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101884:103401 [1] NCCL INFO [Proxy Service] Device 1 CPU core 181 +lshn-qs-g2ri-2:101884:103402 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 84 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO ncclCommSplit comm 0x20f51ea0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 32 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101884:103397 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.18 (kernels 0.00, alloc 0.04, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.12, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO ncclCommSplit comm 0x21dd9a20 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 34 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO comm 0x21dd9a20 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101885:103416 [2] NCCL INFO [Proxy Service] Device 2 CPU core 172 +lshn-qs-g2ri-2:101885:103417 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 72 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO ncclCommSplit comm 0x21dd9a20 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 34 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101885:103412 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Using network Socket +INFO 12-02 01:41:15 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 01:41:15 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 01:41:15 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO ncclCommSplit comm 0x210a22d0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 36 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO comm 0x210a22d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 05/64 
: 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 37/64 : 0 
+lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:101886:103428 [3] NCCL INFO [Proxy Service] Device 3 CPU core 158 +lshn-qs-g2ri-2:101886:103429 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 68 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO ncclCommSplit comm 0x210a22d0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 36 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101886:103427 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +INFO 12-02 01:41:15 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B... 
+INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B... +INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B... +INFO 12-02 01:41:16 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B... +INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 01:41:16 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 01:41:16 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 01:41:17 [weight_utils.py:348] Using model weights format ['*.safetensors'] + + Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] 
via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 16/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM +INFO 12-02 01:41:56 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-02 01:41:56 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 
[3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 07/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 13/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM +INFO 12-02 01:41:57 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-02 01:41:57 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 00/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] 
via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM +INFO 12-02 01:41:57 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-02 01:41:57 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-g2ri-2:101884:103554 
[1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 04/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 19/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Channel 
23/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103551 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:101884:103554 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:101883:103553 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:101886:103552 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +[OpenTinker] 2025-12-02 01:42:00,702 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value. 
+lshn-qs-g2ri-2:101885:101885 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101884:101884 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101883:101883 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101886:101886 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO ncclCommSplit comm 0x1c1970c0 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 37 color 2003953581 key 2- Init START +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO ncclCommSplit comm 0x55f32550 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 37 color 2003953581 key 0- Init START +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO ncclCommSplit comm 0x1b29dc00 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 37 color 2003953581 key 1- Init START +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO ncclCommSplit comm 0x1b438a80 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 37 color 2003953581 key 3- Init START +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO comm 0x1b29dc00 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO comm 0x55f32550 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO comm 0x1b438a80 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO comm 0x1c1970c0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 
[19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] 
NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:101884:103571 [1] NCCL INFO [Proxy Service] Device 1 CPU core 68 +lshn-qs-g2ri-2:101884:103572 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 166 +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-g2ri-2:101883:103573 [0] NCCL INFO [Proxy Service] Device 0 CPU core 146 +lshn-qs-g2ri-2:101883:103574 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 149 +lshn-qs-g2ri-2:101885:103575 [2] NCCL INFO [Proxy Service] Device 2 CPU core 168 +lshn-qs-g2ri-2:101885:103576 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 169 +lshn-qs-g2ri-2:101886:103577 [3] NCCL INFO [Proxy Service] Device 3 CPU core 77 +lshn-qs-g2ri-2:101886:103578 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 78 +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 
+lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO ncclCommSplit comm 0x1c1970c0 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1f1c5c60 splitCount 37 color 2003953581 key 2 - Init COMPLETE +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO ncclCommSplit comm 0x55f32550 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1f8d2890 splitCount 37 color 2003953581 key 0 - Init COMPLETE +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO ncclCommSplit comm 0x1b29dc00 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1e31d2c0 splitCount 37 color 2003953581 key 1 - Init COMPLETE +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO ncclCommSplit comm 0x1b438a80 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1e401220 splitCount 37 color 2003953581 key 3 - Init COMPLETE +lshn-qs-g2ri-2:101885:103561 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.44 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.37) +lshn-qs-g2ri-2:101883:103567 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.02) +lshn-qs-g2ri-2:101884:103564 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.33 (kernels 0.00, 
alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.26) +lshn-qs-g2ri-2:101886:103570 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.03, rest 0.02) +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 00/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 00/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 
04/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 07/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] 
via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 13/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM 
+lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 16/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 19/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM 
+lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Channel 23/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-g2ri-2:101883:103579 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:101886:103580 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:101885:103581 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:101884:103582 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +INFO 12-02 01:42:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:42:04 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 01:42:04 [block_pool.py:292] Successfully reset prefix cache +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + + 0%| | 0/1024 [00:00](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8qozoeij) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). 
+ +### Framework versions + +- PEFT 0.17.1 +- TRL: 0.25.0 +- Transformers: 4.57.1 +- Pytorch: 2.8.0 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/grpo_lora_20251130_192918/adapter_config.json b/grpo_lora_20251130_192918/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da884a8eb3c02a42d08fe869da98a8ad4366197d --- /dev/null +++ b/grpo_lora_20251130_192918/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "o_proj", + 
"down_proj", + "up_proj", + "k_proj", + "gate_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/grpo_lora_20251130_192918/chat_template.jinja b/grpo_lora_20251130_192918/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/grpo_lora_20251130_192918/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' 
in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/grpo_lora_20251130_192918/checkpoint-1024/adapter_config.json b/grpo_lora_20251130_192918/checkpoint-1024/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..da884a8eb3c02a42d08fe869da98a8ad4366197d --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-1024/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "o_proj", + "down_proj", + "up_proj", + "k_proj", + "gate_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git 
a/grpo_lora_20251130_192918/checkpoint-1024/chat_template.jinja b/grpo_lora_20251130_192918/checkpoint-1024/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-1024/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if 
ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/grpo_lora_20251130_192918/checkpoint-1024/latest b/grpo_lora_20251130_192918/checkpoint-1024/latest new file mode 100644 index 0000000000000000000000000000000000000000..97fe0c3f1bf7645f1b3a8c4e0727a37322abbea2 --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-1024/latest @@ -0,0 +1 @@ +global_step1024 \ No newline at end of file diff --git a/grpo_lora_20251130_192918/checkpoint-1024/special_tokens_map.json b/grpo_lora_20251130_192918/checkpoint-1024/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-1024/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/grpo_lora_20251130_192918/checkpoint-1024/tokenizer_config.json b/grpo_lora_20251130_192918/checkpoint-1024/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-1024/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + 
"add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/grpo_lora_20251130_192918/checkpoint-1024/zero_to_fp32.py b/grpo_lora_20251130_192918/checkpoint-1024/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ 
b/grpo_lora_20251130_192918/checkpoint-1024/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/grpo_lora_20251130_192918/checkpoint-128/README.md b/grpo_lora_20251130_192918/checkpoint-128/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-128/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and 
downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/grpo_lora_20251130_192918/checkpoint-128/chat_template.jinja b/grpo_lora_20251130_192918/checkpoint-128/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-128/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + 
tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/grpo_lora_20251130_192918/checkpoint-128/latest b/grpo_lora_20251130_192918/checkpoint-128/latest new file mode 100644 index 0000000000000000000000000000000000000000..b4db7fb020d9ef75e52048bf0cde7481e3ef9351 --- /dev/null +++ b/grpo_lora_20251130_192918/checkpoint-128/latest @@ -0,0 +1 @@ +global_step128 \ No newline at end of file diff --git a/grpo_lora_20251130_192918/checkpoint-128/special_tokens_map.json b/grpo_lora_20251130_192918/checkpoint-128/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ 
b/grpo_lora_20251130_192918/checkpoint-128/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/grpo_lora_20251130_192918/output.log b/grpo_lora_20251130_192918/output.log new file mode 100644 index 0000000000000000000000000000000000000000..41e87109158b6707615cd680febbbbdcaad4da05 --- /dev/null +++ b/grpo_lora_20251130_192918/output.log @@ -0,0 +1,13922 @@ +W1130 19:29:41.689000 398113 torch/distributed/run.py:774] +W1130 19:29:41.689000 398113 torch/distributed/run.py:774] ***************************************** +W1130 19:29:41.689000 398113 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W1130 19:29:41.689000 398113 torch/distributed/run.py:774] ***************************************** +INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda. +INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda. +INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda. +INFO 11-30 19:30:03 [__init__.py:216] Automatically detected platform cuda. 
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, 
save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) + +[OpenTinker] 2025-11-30 19:30:09,846 - root - INFO - Output directory 
outputs/dr_grpo_lora_20251130_192918 already exists, using it +[OpenTinker] 2025-11-30 19:30:09,846 - root - INFO - Output directory outputs/dr_grpo_lora_20251130_192918 already exists, using it +[OpenTinker] 2025-11-30 19:30:09,846 - root - INFO - Output directory outputs/dr_grpo_lora_20251130_192918 already exists, using it +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora', use_peft=True, task_type='CAUSAL_LM', r=16, lora_alpha=32, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dr_grpo_lora_20251130_192918', run_name='outputs/dr_grpo_lora_20251130_192918', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=True, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='grpo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-11-30 19:30:09,849 - root - INFO - Output directory outputs/dr_grpo_lora_20251130_192918 already exists, using it +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. 
Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: setting up run 8qozoeij +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-8qozoeij +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dr_grpo_lora_20251130_192918 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8qozoeij +wandb: setting up run hblruoay +wandb: setting up run axfzdypj +wandb: setting up run 56oyy2tp +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-11-30 19:30:16,189 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-11-30 19:30:16,189 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-hblruoay +wandb: Run `wandb offline` to turn off syncing. 
+wandb: Syncing run outputs/dr_grpo_lora_20251130_192918 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/hblruoay +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-56oyy2tp +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dr_grpo_lora_20251130_192918 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/56oyy2tp +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251130_193013-axfzdypj +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dr_grpo_lora_20251130_192918 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/axfzdypj +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-11-30 19:30:16,546 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-11-30 19:30:16,546 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-11-30 19:30:16,592 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-11-30 19:30:16,592 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-11-30 19:30:16,632 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-11-30 19:30:16,632 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-11-30 19:30:17,510 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-11-30 19:30:17,951 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-11-30 19:30:17,981 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-11-30 19:30:17,984 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-11-30 19:30:21,260 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-11-30 19:30:21,412 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-11-30 19:30:21,466 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +`torch_dtype` is deprecated! Use `dtype` instead! +[OpenTinker] 2025-11-30 19:30:21,785 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! 
+[OpenTinker] 2025-11-30 19:30:22,855 - root - INFO - Model loaded successfully +[OpenTinker] 2025-11-30 19:30:22,856 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-11-30 19:30:22,883 - root - INFO - Model loaded successfully +[OpenTinker] 2025-11-30 19:30:22,884 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-11-30 19:30:22,893 - root - INFO - Model loaded successfully +[OpenTinker] 2025-11-30 19:30:22,894 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-11-30 19:30:23,079 - root - INFO - Lora configured successfully +[OpenTinker] 2025-11-30 19:30:23,111 - root - INFO - Lora configured successfully +[OpenTinker] 2025-11-30 19:30:23,112 - root - INFO - Lora configured successfully +[OpenTinker] 2025-11-30 19:30:23,177 - root - INFO - Model loaded successfully +[OpenTinker] 2025-11-30 19:30:23,177 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-11-30 19:30:23,403 - root - INFO - Lora configured successfully +[OpenTinker] 2025-11-30 19:30:23,567 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpjzmx7k6g/test.c -o /tmp/tmpjzmx7k6g/test.o +[OpenTinker] 2025-11-30 19:30:23,567 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp5gptor5t/test.c -o /tmp/tmp5gptor5t/test.o +[OpenTinker] 2025-11-30 19:30:23,567 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpx1zh486p/test.c -o /tmp/tmpx1zh486p/test.o +[OpenTinker] 2025-11-30 19:30:23,594 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat 
/tmp/tmpjzmx7k6g/test.o -laio -o /tmp/tmpjzmx7k6g/a.out +[OpenTinker] 2025-11-30 19:30:23,605 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp5gptor5t/test.o -laio -o /tmp/tmp5gptor5t/a.out +[OpenTinker] 2025-11-30 19:30:23,605 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpx1zh486p/test.o -laio -o /tmp/tmpx1zh486p/a.out +[OpenTinker] 2025-11-30 19:30:23,743 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpbwojjnr2/test.c -o /tmp/tmpbwojjnr2/test.o +[OpenTinker] 2025-11-30 19:30:23,771 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpbwojjnr2/test.o -laio -o /tmp/tmpbwojjnr2/a.out +[OpenTinker] 2025-11-30 19:30:24,099 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp89odbgc6/test.c -o /tmp/tmp89odbgc6/test.o +[OpenTinker] 2025-11-30 19:30:24,118 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpq77yq7ci/test.c -o /tmp/tmpq77yq7ci/test.o +[OpenTinker] 2025-11-30 19:30:24,137 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpfom6ln06/test.c -o /tmp/tmpfom6ln06/test.o +[OpenTinker] 2025-11-30 19:30:24,149 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp89odbgc6/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp89odbgc6/a.out +[OpenTinker] 2025-11-30 19:30:24,162 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpq77yq7ci/test.o -L/usr/local/cuda 
-L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpq77yq7ci/a.out +[OpenTinker] 2025-11-30 19:30:24,175 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpfom6ln06/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpfom6ln06/a.out +[OpenTinker] 2025-11-30 19:30:24,333 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmph6jzh41r/test.c -o /tmp/tmph6jzh41r/test.o +[OpenTinker] 2025-11-30 19:30:24,359 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmph6jzh41r/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmph6jzh41r/a.out +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO cudaDriverVersion 12090 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO cudaDriverVersion 12090 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO cudaDriverVersion 12090 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO cudaDriverVersion 12090 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 
+lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Bootstrap: Using eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Initialized NET plugin Socket +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Initialized NET plugin Socket +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. 
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Initialized NET plugin Socket +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.233.174<0> +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Initialized NET plugin Socket +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO ncclCommInitRankConfig comm 0x19032270 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x5f2965acf2c07fe9 - Init START +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO ncclCommInitRankConfig comm 0x191e30a0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0x5f2965acf2c07fe9 - Init START +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO ncclCommInitRankConfig comm 0x1a5a5680 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x5f2965acf2c07fe9 - Init START +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO ncclCommInitRankConfig comm 0x18ac50d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x5f2965acf2c07fe9 - Init START +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO RAS client listening socket at ::1<28028> 
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Bootstrap timings total 0.000784 (create 0.000019, send 0.000101, recv 0.000107, ring 0.000172, delay 0.000001) +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Bootstrap timings total 0.051831 (create 0.000022, send 0.000091, recv 0.018422, ring 0.000055, delay 0.000001) +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Bootstrap timings total 0.002087 (create 0.000020, send 0.000764, recv 0.000689, ring 0.000199, delay 0.000001) +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Bootstrap timings total 0.033559 (create 0.000024, send 0.000095, recv 0.032242, ring 0.000593, delay 0.000001) +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO comm 0x191e30a0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. 
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO comm 0x19032270 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO comm 0x1a5a5680 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO comm 0x18ac50d0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. 
+lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 11/24 : 0 1 2 3 
+lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-e9wz-2:398357:399452 [3] NCCL INFO [Proxy Service] Device 3 CPU core 26 +lshn-qs-e9wz-2:398357:399453 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 32 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-e9wz-2:398355:399454 [1] NCCL INFO [Proxy Service] Device 1 CPU core 120 +lshn-qs-e9wz-2:398355:399455 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 34 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-e9wz-2:398354:399456 [0] NCCL INFO [Proxy Service] Device 0 CPU core 45 +lshn-qs-e9wz-2:398354:399457 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 11 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-e9wz-2:398356:399458 [2] NCCL INFO [Proxy Service] Device 2 CPU core 111 +lshn-qs-e9wz-2:398356:399459 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 42 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO ncclCommInitRankConfig comm 0x191e30a0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0x5f2965acf2c07fe9 - Init COMPLETE +lshn-qs-e9wz-2:398356:399444 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.95 (kernels 0.18, alloc 0.61, bootstrap 0.03, allgathers 0.00, topo 0.05, graphs 0.01, connections 0.05, rest 0.03) +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO ncclCommInitRankConfig comm 0x19032270 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x5f2965acf2c07fe9 - Init COMPLETE +lshn-qs-e9wz-2:398355:399445 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.95 (kernels 0.17, alloc 0.59, bootstrap 0.05, allgathers 0.00, topo 0.05, graphs 0.01, connections 0.04, rest 0.03) +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO ncclCommInitRankConfig comm 0x18ac50d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x5f2965acf2c07fe9 - Init COMPLETE +lshn-qs-e9wz-2:398354:399447 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.89 (kernels 0.19, alloc 0.56, bootstrap 0.00, allgathers 0.01, topo 0.05, graphs 0.01, connections 0.04, rest 0.04) +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO ncclCommInitRankConfig comm 0x1a5a5680 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x5f2965acf2c07fe9 - Init COMPLETE +lshn-qs-e9wz-2:398357:399446 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.90 (kernels 0.16, alloc 0.59, bootstrap 0.00, allgathers 0.00, topo 0.05, graphs 0.01, connections 0.05, rest 0.03) +[OpenTinker] 2025-11-30 19:30:28,208 - root - INFO - Training model with GRPO +[OpenTinker] 2025-11-30 19:30:28,212 - root - INFO - Training model with GRPO +[OpenTinker] 2025-11-30 19:30:28,227 - root - INFO - Training model with GRPO +[OpenTinker] 2025-11-30 19:30:28,237 - root - INFO - Training model with GRPO +[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {} +[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {} +[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {} +[OpenTinker] 2025-11-30 19:30:31,647 - liger_kernel.transformers.monkey_patch - INFO - Applying Liger kernels to model instance with model type: qwen2 with kwargs: {} +INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 
'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 11-30 19:30:31 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896 +INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896 +INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896 +INFO 11-30 19:30:48 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 11-30 19:30:48 [__init__.py:1815] Using max model len 16896 +INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 11-30 19:30:49 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. 
+INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 11-30 19:30:50 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, 
compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, 
compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, 
compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 11-30 19:30:52 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, 
compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +[rank2]:[W1130 19:30:53.106048450 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +[rank1]:[W1130 19:30:53.127200306 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +[rank3]:[W1130 19:30:53.134440865 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +[rank0]:[W1130 19:30:53.175332231 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. 
(function operator()) +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO ncclCommSplit comm 0x1b723300 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 1 color 2003953581 key 3- Init START +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO ncclCommSplit comm 0x1a22d200 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 1 color 2003953581 key 1- Init START +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO ncclCommSplit comm 0x1a2a9f20 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 1 color 2003953581 key 0- Init START +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO ncclCommSplit comm 0x1a4553e0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 1 color 2003953581 key 2- Init START +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO comm 0x1a4553e0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO comm 0x1a22d200 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO comm 0x1b723300 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO comm 0x1a2a9f20 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] 
-1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL 
INFO Channel 22/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398356:399604 [2] NCCL INFO [Proxy Service] Device 2 CPU core 114 +lshn-qs-e9wz-2:398357:399603 [3] NCCL INFO [Proxy Service] Device 3 CPU core 103 +lshn-qs-e9wz-2:398356:399605 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 115 +lshn-qs-e9wz-2:398357:399606 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 9 +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-e9wz-2:398354:399607 [0] NCCL INFO [Proxy Service] Device 0 CPU core 117 +lshn-qs-e9wz-2:398354:399608 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 22 +lshn-qs-e9wz-2:398355:399609 [1] NCCL INFO [Proxy Service] Device 1 CPU core 2 +lshn-qs-e9wz-2:398355:399610 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 25 +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 
+lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO ncclCommSplit comm 0x1a2a9f20 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 1 color 2003953581 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO ncclCommSplit comm 0x1a4553e0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 1 color 2003953581 key 2 - Init COMPLETE +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO ncclCommSplit comm 0x1a22d200 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 1 color 2003953581 key 1 - Init COMPLETE +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO ncclCommSplit comm 0x1b723300 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 1 color 2003953581 key 3 - Init COMPLETE +lshn-qs-e9wz-2:398354:399602 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.01) +lshn-qs-e9wz-2:398356:399592 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.08) +lshn-qs-e9wz-2:398355:399596 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.11 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.05) +lshn-qs-e9wz-2:398357:399599 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 
0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.05) +[Gloo] Rank 0 is connected to 3[Gloo] Rank [Gloo] Rank peer ranks. Expected number of connected peer ranks is : [Gloo] Rank 31 is connected to 2 is connected to 3 is connected to 33 peer ranks. 3 peer ranks. + peer ranks. Expected number of connected peer ranks is : Expected number of connected peer ranks is : 3Expected number of connected peer ranks is : 33 + + +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO ncclCommSplit comm 0x1a3beb50 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 2 color 59908776 key 0- Init START +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO comm 0x1a3beb50 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 01/64 : 0 
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 33/64 : 0 
+lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] 
-1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398354:399633 [0] NCCL INFO [Proxy Service] Device 0 CPU core 5 +lshn-qs-e9wz-2:398354:399634 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 108 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO ncclCommSplit comm 0x1a3beb50 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 2 color 59908776 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398354:399628 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.01) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO ncclCommSplit comm 0x1a3418c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 4 color 440515407 key 0- Init START +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO comm 0x1a3418c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398355:399644 
[1] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO 
Channel 38/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] 
-1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398355:399648 [1] NCCL INFO [Proxy Service] Device 1 CPU core 110 +lshn-qs-e9wz-2:398355:399649 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 107 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO ncclCommSplit comm 0x1a3418c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 4 color 440515407 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398355:399644 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO ncclCommSplit comm 0x1a55cff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 6 color 1227022723 key 0- Init START +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO comm 0x1a55cff0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398356:399663 [2] NCCL INFO [Proxy Service] Device 2 CPU core 24 +lshn-qs-e9wz-2:398356:399664 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 36 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO ncclCommSplit comm 0x1a55cff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 6 color 1227022723 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398356:399659 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.05 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.03, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO ncclCommSplit comm 0x1b837da0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 8 color 1301067556 key 0- Init START +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO comm 0x1b837da0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398357:399680 [3] NCCL INFO [Proxy Service] Device 3 CPU core 113 +lshn-qs-e9wz-2:398357:399681 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 41 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO ncclCommSplit comm 0x1b837da0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 8 color 1301067556 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398357:399674 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.06, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO ncclCommSplit comm 0x1bb1b890 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 9 color 59908776 key 0- Init START +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO comm 0x1bb1b890 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398354:399689 [0] NCCL INFO [Proxy Service] Device 0 CPU core 98 +lshn-qs-e9wz-2:398354:399690 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 104 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO ncclCommSplit comm 0x1bb1b890 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 9 color 59908776 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398354:399679 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.12 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.09) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO ncclCommSplit comm 0x1ba8b5c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 11 color 440515407 key 0- Init START +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO comm 0x1ba8b5c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398355:399704 [1] NCCL INFO [Proxy Service] Device 1 CPU core 16 +lshn-qs-e9wz-2:398355:399705 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 115 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO ncclCommSplit comm 0x1ba8b5c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 11 color 440515407 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398355:399700 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO ncclCommSplit comm 0x1bc4c240 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 13 color 1227022723 key 0- Init START +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO comm 0x1bc4c240 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398356:399719 [2] NCCL INFO [Proxy Service] Device 2 CPU core 40 +lshn-qs-e9wz-2:398356:399720 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 121 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO ncclCommSplit comm 0x1bc4c240 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 13 color 1227022723 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398356:399715 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO ncclCommSplit comm 0x1cf940f0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 15 color 1301067556 key 0- Init START +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO comm 0x1cf940f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398357:399736 [3] NCCL INFO [Proxy Service] Device 3 CPU core 143 +lshn-qs-e9wz-2:398357:399737 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 9 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO ncclCommSplit comm 0x1cf940f0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 15 color 1301067556 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398357:399730 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO ncclCommSplit comm 0x1bc234a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 16 color 59908776 key 0- Init START +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO comm 0x1bc234a0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398354:399745 [0] NCCL INFO [Proxy Service] Device 0 CPU core 13 +lshn-qs-e9wz-2:398354:399746 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 15 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO ncclCommSplit comm 0x1bc234a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 16 color 59908776 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398354:399735 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO ncclCommSplit comm 0x1bb931d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 18 color 440515407 key 0- Init START +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO comm 0x1bb931d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398355:399760 [1] NCCL INFO [Proxy Service] Device 1 CPU core 139 +lshn-qs-e9wz-2:398355:399761 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 113 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO ncclCommSplit comm 0x1bb931d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 18 color 440515407 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398355:399756 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO ncclCommSplit comm 0x1bd53e50 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 20 color 1227022723 key 0- Init START +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO comm 0x1bd53e50 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398356:399775 [2] NCCL INFO [Proxy Service] Device 2 CPU core 0 +lshn-qs-e9wz-2:398356:399776 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 114 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO ncclCommSplit comm 0x1bd53e50 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 20 color 1227022723 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398356:399771 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO ncclCommSplit comm 0x1d09bd00 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 22 color 1301067556 key 0- Init START +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO comm 0x1d09bd00 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398357:399792 [3] NCCL INFO [Proxy Service] Device 3 CPU core 101 +lshn-qs-e9wz-2:398357:399793 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 103 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO ncclCommSplit comm 0x1d09bd00 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 22 color 1301067556 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398357:399786 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO ncclCommSplit comm 0x1bd2b0b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 23 color 59908776 key 0- Init START +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO comm 0x1bd2b0b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398354:399801 [0] NCCL INFO [Proxy Service] Device 0 CPU core 16 +lshn-qs-e9wz-2:398354:399802 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 4 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO ncclCommSplit comm 0x1bd2b0b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 23 color 59908776 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398354:399791 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO ncclCommSplit comm 0x1bc9ade0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 25 color 440515407 key 0- Init START +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO comm 0x1bc9ade0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398355:399816 [1] NCCL INFO [Proxy Service] Device 1 CPU core 40 +lshn-qs-e9wz-2:398355:399817 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 25 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO ncclCommSplit comm 0x1bc9ade0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 25 color 440515407 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398355:399812 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO ncclCommSplit comm 0x1be5ba60 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 27 color 1227022723 key 0- Init START +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO comm 0x1be5ba60 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398356:399831 [2] NCCL INFO [Proxy Service] Device 2 CPU core 110 +lshn-qs-e9wz-2:398356:399832 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 104 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO ncclCommSplit comm 0x1be5ba60 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 27 color 1227022723 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398356:399827 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO ncclCommSplit comm 0x1d1a3910 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 29 color 1301067556 key 0- Init START +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO comm 0x1d1a3910 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 06/64 : 0 
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 38/64 : 0 
+lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] 
-1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398357:399848 [3] NCCL INFO [Proxy Service] Device 3 CPU core 6 +lshn-qs-e9wz-2:398357:399849 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 9 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO ncclCommSplit comm 0x1d1a3910 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 29 color 1301067556 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398357:399842 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.07, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO ncclCommSplit comm 0x1be32cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 30 color 59908776 key 0- Init START +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO comm 0x1be32cc0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 12/64 : 0 
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 44/64 : 0 
+lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] 
-1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398354:399857 [0] NCCL INFO [Proxy Service] Device 0 CPU core 117 +lshn-qs-e9wz-2:398354:399858 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 107 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO ncclCommSplit comm 0x1be32cc0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 30 color 59908776 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398354:399847 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.10) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO ncclCommSplit comm 0x1bda29f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 32 color 440515407 key 0- Init START +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO comm 0x1bda29f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398355:399872 [1] NCCL INFO [Proxy Service] Device 1 CPU core 13 +lshn-qs-e9wz-2:398355:399873 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 15 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO ncclCommSplit comm 0x1bda29f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 32 color 440515407 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398355:399868 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO ncclCommSplit comm 0x1bf63670 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 34 color 1227022723 key 0- Init START +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO comm 0x1bf63670 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398356:399887 [2] NCCL INFO [Proxy Service] Device 2 CPU core 5 +lshn-qs-e9wz-2:398356:399888 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 7 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO ncclCommSplit comm 0x1bf63670 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 34 color 1227022723 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398356:399883 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Using network Socket +INFO 11-30 19:30:54 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 11-30 19:30:54 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO ncclCommSplit comm 0x1d2ab520 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 36 color 1301067556 key 0- Init START +INFO 11-30 19:30:54 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO comm 0x1d2ab520 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 05/64 : 0 
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 37/64 : 0 
+lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] 
-1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0-> +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-e9wz-2:398357:399899 [3] NCCL INFO [Proxy Service] Device 3 CPU core 45 +lshn-qs-e9wz-2:398357:399900 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 108 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO ncclCommSplit comm 0x1d2ab520 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 36 color 1301067556 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398357:399898 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +INFO 11-30 19:30:54 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... 
+INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 11-30 19:30:54 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 11-30 19:30:55 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 11-30 19:30:55 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 11-30 19:30:56 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 11-30 19:30:56 [weight_utils.py:406] No model.safetensors.index.json found in remote. 
+ + Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM 
+lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM +INFO 11-30 19:31:13 [llm.py:295] Supported_tasks: ('generate',) +INFO 11-30 19:31:13 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO 
Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM +INFO 11-30 19:31:13 [llm.py:295] Supported_tasks: ('generate',) +INFO 11-30 19:31:13 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM 
+lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM +INFO 11-30 19:31:13 [llm.py:295] Supported_tasks: ('generate',) +INFO 11-30 19:31:13 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO 
Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400020 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-e9wz-2:398356:400019 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-e9wz-2:398357:400022 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-e9wz-2:398355:400021 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. 
The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +[OpenTinker] 2025-11-30 19:31:14,416 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value. +lshn-qs-e9wz-2:398356:398356 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398354:398354 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:398355 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398357:398357 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Using network Socket +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO ncclCommSplit comm 0x4ab09c80 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 37 color 2003953581 key 1- Init START +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO ncclCommSplit comm 0x4f5333d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 37 color 2003953581 key 0- Init START +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO ncclCommSplit comm 0x4c09f530 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 37 color 2003953581 key 3- Init START +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO ncclCommSplit comm 0x4acd8820 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 37 color 2003953581 key 2- Init START 
+lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO comm 0x4acd8820 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO comm 0x4c09f530 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO comm 0x4f5333d0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO comm 0x4ab09c80 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] 
-1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] 
NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-e9wz-2:398356:400035 [2] NCCL INFO [Proxy Service] Device 2 CPU core 110 +lshn-qs-e9wz-2:398356:400036 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 114 +lshn-qs-e9wz-2:398355:400037 [1] NCCL INFO [Proxy Service] Device 1 CPU core 25 +lshn-qs-e9wz-2:398355:400038 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 125 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-e9wz-2:398354:400039 [0] NCCL INFO [Proxy Service] Device 0 CPU core 32 +lshn-qs-e9wz-2:398354:400040 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 24 +lshn-qs-e9wz-2:398357:400041 [3] NCCL INFO [Proxy Service] Device 3 CPU core 123 +lshn-qs-e9wz-2:398357:400042 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 140 
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO ncclCommSplit comm 0x4c09f530 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1a5a5680 splitCount 37 color 2003953581 key 3 - Init COMPLETE +lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO ncclCommSplit comm 0x4ab09c80 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x19032270 splitCount 37 color 2003953581 key 1 - Init COMPLETE +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO ncclCommSplit comm 0x4f5333d0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x18ac50d0 splitCount 37 color 2003953581 key 0 - Init COMPLETE +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO ncclCommSplit comm 0x4acd8820 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x191e30a0 splitCount 37 color 2003953581 key 2 - Init COMPLETE +lshn-qs-e9wz-2:398357:400034 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.01) 
+lshn-qs-e9wz-2:398355:400033 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.01) +lshn-qs-e9wz-2:398354:400028 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.03, rest 0.01) +lshn-qs-e9wz-2:398356:400025 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.02) +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 04/0 : 2[2] -> 
3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM 
+lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM 
+lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM 
+lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-e9wz-2:398354:400044 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-e9wz-2:398357:400046 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-e9wz-2:398355:400045 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-e9wz-2:398356:400043 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +INFO 11-30 19:31:15 [block_pool.py:292] Successfully reset prefix cache +INFO 11-30 19:31:15 [block_pool.py:292] Successfully reset prefix cache +INFO 11-30 19:31:15 [block_pool.py:292] Successfully reset prefix cache +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. 
+ + 0%| | 0/1024 [00:00 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398356:399458 [2] NCCL INFO misc/socket.cc:915 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398357:459910 [3] NCCL INFO comm 0x1d2ab520 rank 0 nranks 1 cudaDev 3 busId c6000 - Abort COMPLETE +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398356:399458 [2] NCCL INFO misc/socket.cc:915 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398357:399452 [3] NCCL INFO misc/socket.cc:915 -> 3 +lshn-qs-e9wz-2:398356:459916 [2] NCCL INFO comm 0x191e30a0 rank 2 nranks 4 cudaDev 2 busId a2000 - Abort COMPLETE +lshn-qs-e9wz-2:398355:459912 [1] NCCL INFO comm 0x1bda29f0 rank 0 nranks 1 cudaDev 1 busId 7e000 - Abort COMPLETE +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:64 -> 3 
+lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398355:399454 [1] NCCL INFO misc/socket.cc:915 -> 3 +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398354:459914 [0] NCCL INFO comm 0x1bc234a0 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE +lshn-qs-e9wz-2:398357:459918 [3] NCCL INFO comm 0x1a5a5680 rank 3 nranks 4 cudaDev 3 busId c6000 - Abort COMPLETE +lshn-qs-e9wz-2:398355:459920 [1] NCCL INFO comm 0x19032270 rank 1 nranks 4 cudaDev 1 busId 7e000 - Abort COMPLETE +lshn-qs-e9wz-2:398355:400037 [1] NCCL INFO [Service thread] Connection closed by localRank 2 +lshn-qs-e9wz-2:398357:400041 [3] NCCL INFO [Service thread] Connection closed by localRank 2 +lshn-qs-e9wz-2:398354:459922 [0] NCCL INFO comm 0x1bd2b0b0 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE +lshn-qs-e9wz-2:398354:400039 [0] NCCL INFO [Service thread] Connection closed by localRank 3 +lshn-qs-e9wz-2:398354:459925 [0] NCCL INFO comm 0x1be32cc0 rank 0 nranks 1 cudaDev 0 busId 8000 - Abort COMPLETE +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398354:399456 [0] NCCL INFO misc/socket.cc:915 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:64 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:81 -> 3 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO misc/socket.cc:863 -> 3 +lshn-qs-e9wz-2:398354:400039 [0] NCCL INFO [Service thread] 
Connection closed by localRank 1 +lshn-qs-e9wz-2:398354:459927 [0] NCCL INFO comm 0x18ac50d0 rank 0 nranks 4 cudaDev 0 busId 8000 - Abort COMPLETE diff --git a/grpo_lora_20251130_192918/special_tokens_map.json b/grpo_lora_20251130_192918/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/grpo_lora_20251130_192918/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/grpo_lora_20251130_192918/tokenizer_config.json b/grpo_lora_20251130_192918/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/grpo_lora_20251130_192918/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +}